Example #1
0
File: emt.py Project: emtext/emtext
def extract(raw_uni):
    """Yield the text of every parsed line that ``check`` scores above 0.4.

    ``raw_uni`` is encoded to UTF-8 and handed to ``parse.extract_text``.
    For each returned item a density ``len(item.text) / item.bytes`` is
    computed, and ``check`` is called with
    ``[density, previous density, next density]``. The first and last lines
    use 0 for the missing neighbour.

    Args:
        raw_uni: text object whose ``encode("utf-8")`` yields the document
            to parse.

    Yields:
        The stripped, UTF-8 decoded text of each accepted line, in document
        order.

    Raises:
        ZeroDivisionError: if a parsed item reports ``bytes == 0``.
    """
    parsed = parse.extract_text(raw_uni.encode("utf-8"))

    texts = []
    densities = []
    for item in parsed:
        byte_count = item.bytes
        char_count = len(item.text)
        texts.append(item.text.strip().decode("utf-8", "ignore"))
        densities.append(float(char_count) / byte_count)

    # Zero-density sentinels on both ends give every real line a previous
    # and a next neighbour.
    padded = [0] + densities + [0]

    for pos, text in enumerate(texts):
        # padded[pos + 1] is the current line; its neighbours sit on
        # either side of it.
        features = [padded[pos + 1], padded[pos], padded[pos + 2]]
        if check(features) > 0.4:
            yield text
Example #2
0
def train_from_rss(feeds):
    """Build a balanced training set from feed entries, train on it and dump it.

    For every ``(link, content)`` pair the page behind ``link`` is downloaded
    and split into lines with ``parse.extract_text``. A line is labelled 1
    when its stripped text occurs in ``content`` with the tags removed, and 0
    otherwise. Each sample has the form
    ``[density, previous density, next density, label]`` where density is
    ``len(text) / bytes`` of the line; the first and last lines use 0 for the
    missing neighbour.

    Positive and negative samples are shuffled and truncated to the same
    size (at most 100 each), passed to ``train``, and the
    ``density,label`` pair of every sample is written to ``train.csv`` in the
    current working directory.

    Args:
        feeds: iterable of ``(link, content)`` pairs. ``link`` is opened with
            ``urllib2.urlopen``; ``content`` is the entry's markup text.

    Raises:
        ZeroDivisionError: if a parsed line reports ``bytes == 0``.
    """
    all_lines = []
    for link, content in feeds:
        # Drop the tags so page lines can be matched against plain text.
        text_clean = re.sub('<[^<]+?>', '', content)

        # Close the HTTP response explicitly instead of leaking the socket.
        response = urllib2.urlopen(link)
        try:
            raw = response.read()
        finally:
            response.close()

        encoding = chardet.detect(raw)['encoding']
        if not encoding:
            # chardet reports None when it cannot guess (e.g. an empty body);
            # raw.decode(None) would raise TypeError and abort the whole run,
            # so skip this feed instead.
            continue
        raw_uni = raw.decode(encoding)

        samples = []
        for item in parse.extract_text(raw_uni.encode('utf-8')):
            text = item.text.strip().decode('utf-8', 'ignore')
            density = float(len(item.text)) / item.bytes
            # NOTE(review): an empty ``text`` is a substring of any string,
            # so blank lines are labelled positive -- confirm this is wanted.
            label = 1 if text in text_clean else 0
            samples.append((density, label))

        # Only pages with more than 2 lines are meaningful.
        # NOTE(review): the original comments suggest the samples were meant
        # to be limited to lines up to the last positive one; that trimming
        # was never applied and is not applied here either.
        if len(samples) > 2:
            # Zero-density sentinels give every real line two neighbours.
            densities = [0] + [d for d, _ in samples] + [0]
            for pos, (density, label) in enumerate(samples):
                all_lines.append(
                    [density, densities[pos], densities[pos + 2], label])

    # Balance positive samples and negative samples:
    positive = [line for line in all_lines if line[-1] > 0]
    negative = [line for line in all_lines if line[-1] <= 0]
    random.shuffle(positive)
    random.shuffle(negative)
    # Too many samples may not be a good thing, so cap each class at 100.
    min_len = min(len(positive), len(negative), 100)
    all_lines = positive[:min_len] + negative[:min_len]
    random.shuffle(all_lines)
    train(all_lines)
    with open('train.csv', 'w') as f:
        for line in all_lines:
            # Parenthesised form prints the same under the print statement.
            print(line)
            f.write(','.join(map(str, (line[0], line[-1]))) + '\n')