def parsefile(f, inPre, titleSet, per, loc, org, other):
    """Yield (lowercased token string, bk.News) pairs from the 11-column TSV
    dump at inPre + f, skipping duplicate titles and overlong bodies."""
    fin = codecs.open(inPre + f, encoding='utf-8')
    for line in fin:
        if len(line.strip().split("\t")) != 11:
            continue
        ID, url, title, source, created_at, authors, key_word, snippets, \
            raw_text, h_tokens_ent, b_tokens_ent = line.strip().split("\t")
        #  h_tokens, b_tokens,
        if title in titleSet:  # drop duplicate articles
            continue
        titleSet.add(title)
        if len(b_tokens_ent.split()) > MAX_BODY_LEN:
            continue

        h_tokens_ent = unidecode.unidecode(h_tokens_ent.strip())
        b_tokens_ent = unidecode.unidecode(b_tokens_ent.strip())
        #h = grep_ent_with_context(h_tokens_ent, per, loc, org, other)  # fds_per_| asked me about ...
        #b = grep_ent_with_context(b_tokens_ent, per, loc, org, other)
        h = grep_ent(h_tokens_ent, per, loc, org, other)  # fsd_per_| oregon_loc_| ...
        b = grep_ent(b_tokens_ent, per, loc, org, other)
        h = rep2.sub('', h)
        b = rep2.sub('', b)
        h = my_tokenizer(h, tokenizer)
        b = my_tokenizer(b, tokenizer)
        tokens = h + ' ' + h + ' ' + b  # repeat the title to double its weight
        yield tokens.lower(), bk.News(
            ID, title, raw_text, snippets, key_word, source, created_at,
            f.split('.')[0], h_tokens_ent,
            b_tokens_ent)  # can also leave lowercase to scikit
    fin.close()
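# A minimal usage sketch for the entity-annotated generator above: assumes
# the module globals it relies on (grep_ent, my_tokenizer, tokenizer, rep2,
# MAX_BODY_LEN, bk) are initialized; the file name, prefix, and entity sets
# are hypothetical placeholders.
def demo_parsefile(per, loc, org, other):
    titleSet = set()
    for tokens, news in parsefile('2016-01-13.tsv', 'dumps/', titleSet,
                                  per, loc, org, other):
        print(tokens[:80])  # news holds the bk.News metadata object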
def readfile(file, dataop, count, ind2obj, dtpure, lines):
    """Index 9- or 10-column TSV lines (10 once dbpedia entities are stored),
    keeping only articles that mention 'powerball'. count / ind2obj / lines
    are caller-supplied so several files can accumulate into one index."""
    for line in file:
        fields = line.strip().split("\t")
        if len(fields) == 10:  # after dbpedia entities are stored
            ID, url, title, source, created_at, authors, key_word, \
                snippets, raw_text, entities = fields
        elif len(fields) == 9:
            ID, url, title, source, created_at, authors, key_word, \
                snippets, raw_text = fields
        else:
            continue
        strAll = (title + raw_text).lower()
        interested = 'powerball' in strAll  # and 'benghazi' in strAll
        if not interested:
            continue
        ID = int(ID)
        ind2obj[count] = bk.News(ID, title, raw_text, snippets, key_word,
                                 source, created_at, dtpure)
        if dataop == "all":
            lines.append(line)
        elif dataop == "text":
            lines.append('\t'.join([url, title, key_word, snippets, raw_text]))
        elif dataop == "snippets":
            lines.append('\t'.join([url, title, key_word, snippets]))
        count += 1
    return count
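# Sketch of the intended call pattern: the accumulators persist across files
# so indices keep growing (the paths and the 'text' dataop are hypothetical):
def demo_readfile(paths, dtpure):
    count, ind2obj, lines = 0, {}, []
    for p in paths:
        with codecs.open(p, encoding='utf-8') as fh:
            count = readfile(fh, 'text', count, ind2obj, dtpure, lines)
    return count, ind2obj, lines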
def parsefile(f, per, loc, org, other):
    """Variant for plain one-article-per-line files (no TSV columns): yield
    (lowercased token string, bk.News) pairs built from the body text alone."""
    fin = codecs.open(f, encoding='utf-8')
    for line in fin:
        #if len(b_tokens.split()) > MAX_BODY_LEN:
        #    continue
        b_tokens_ent = unidecode.unidecode(line.strip())
        b = grep_ent(b_tokens_ent, per, loc, org, other)
        b = rep2.sub('', b)
        b = my_tokenizer(b, tokenizer)
        yield b.lower(), bk.News(raw_text=line.strip())  # can also leave lowercase to scikit
    fin.close()
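# Minimal sketch for the plain-text variant above (hypothetical path; the
# entity sets and module globals are assumed to exist as before):
def demo_parsefile_plain(per, loc, org, other):
    for tokens, news in parsefile('bodies.txt', per, loc, org, other):
        print(tokens[:80])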
def readfile(file, dataop, count, ind2obj, lines):
    """Like readfile above, but strictly 9-column TSV and without the dtpure
    date field; accumulators are again supplied by the caller."""
    for line in file:
        if len(line.strip().split("\t")) != 9:
            continue
        ID, url, title, source, created_at, authors, key_word, snippets, \
            raw_text = line.strip().split("\t")
        ind2obj[count] = bk.News(ID, title, raw_text, snippets, key_word,
                                 source, created_at)
        if dataop == "all":
            lines.append(line)
        elif dataop == "text":
            lines.append('\t'.join([url, title, key_word, snippets, raw_text]))
        elif dataop == "snippets":
            lines.append('\t'.join([url, title, key_word, snippets]))
        count += 1
    return count
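# Short sketch of the dataop switch: collect snippet-only rows from one file
# and write them back out (both file names are hypothetical):
def demo_snippets_dump(in_path, out_path):
    count, ind2obj, lines = 0, {}, []
    with codecs.open(in_path, encoding='utf-8') as fh:
        count = readfile(fh, 'snippets', count, ind2obj, lines)
    with codecs.open(out_path, 'w', encoding='utf-8') as out:
        out.write('\n'.join(lines))
    return count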
def readfile(file, dataop, count, ind2obj, dtpure, lines):
    """Strict variant: accept only 10-column TSV lines (after dbpedia
    entities are stored) and index every article, with no keyword filter."""
    for line in file:
        if len(line.strip().split("\t")) != 10:
            continue
        ID, url, title, source, created_at, authors, key_word, snippets, \
            raw_text, entities = line.strip().split("\t")
        ID = int(ID)
        ind2obj[count] = bk.News(ID, title, raw_text, snippets, key_word,
                                 source, created_at, dtpure)
        if dataop == "all":
            lines.append(line)
        elif dataop == "text":
            lines.append('\t'.join([url, title, key_word, snippets, raw_text]))
        elif dataop == "snippets":
            lines.append('\t'.join([url, title, key_word, snippets]))
        count += 1
    return count
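# A hedged end-to-end sketch: the comments above note that lowercasing "can
# also be left to scikit", so this shows one plausible downstream wiring of
# the parsed token strings into a scikit-learn vectorizer. TfidfVectorizer
# and the path are assumptions, not necessarily the original pipeline.
def demo_vectorize(path, per, loc, org, other):
    from sklearn.feature_extraction.text import TfidfVectorizer
    docs, news_objs = [], []
    for tokens, news in parsefile(path, per, loc, org, other):
        docs.append(tokens)
        news_objs.append(news)
    X = TfidfVectorizer().fit_transform(docs)  # rows of X align with news_objs
    return X, news_objs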