Example #1
def __detect_concepts(input_file, output_file, concepts, window_size):
    '''
    Detect tokens and concepts in a single file and write them,
    space-separated, to output_file.
    '''
    print(input_file)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # Slide an n-gram window over the input and match it against the concept list.
    d = detector(ngram(xutils.open_file(input_file), window_size), concepts,
                 window_size)
    f = xutils.open_file(output_file, 'wt')
    for ts in d:
        for t in ts:
            f.write(t)
            # Separate tokens with a space, but keep line breaks as-is.
            if t != '\n':
                f.write(' ')
    f.close()
Example #2
def preprocess_merge(input_files, output_file):
    fout = xutils.open_file(output_file, 'wt')
    for input_file in input_files:
        fin = xutils.open_file(input_file, 'rt')
        lines = fin
        if 'pmc_ocr' in input_file:
            # OCR output: undo hyphenation at line breaks, re-join the lines,
            # and split the text back into sentences.
            lines = sent_tokenize(fin.read().replace('-\n',
                                                     '').replace('\n', ' '))
        for line in lines:
            new_line = preprocess_line(line)
            if new_line:
                fout.write(new_line)
                fout.write('\n')
        fin.close()
        # Blank lines mark the boundary between input documents.
        fout.write('\n\n')
    fout.close()
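A minimal usage sketch, not from the original source: the paths and the presence of a 'pmc_ocr' OCR input are illustrative assumptions; preprocess_line and xutils.open_file come from the surrounding project.

# Hypothetical call: merge a gzipped OCR file and a plain abstract file into
# one preprocessed corpus file, with blank lines between documents (paths are
# made up for illustration).
preprocess_merge(['pmc_ocr/article_0001.txt.gz', 'abstracts/batch_01.txt'],
                 'corpus/merged.txt.gz')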
Example #3
def extract_pubmed(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    # Opening/closing tag pairs whose contents we want to keep.
    flags = [["<ArticleTitle>", "</ArticleTitle>"],
             ["<AbstractText>", "</AbstractText>"]]
    for fn in glob.glob(input_dir + '/**/*.xml.gz', recursive=True):
        print(fn)
        fin = xutils.open_file(fn, 'rt')
        fout = xutils.open_file(
            output_dir + '/' + os.path.splitext(os.path.basename(fn))[0] +
            '.txt', 'wt')
        for line in fin:
            for flag in flags:
                i = line.find(flag[0])
                while i >= 0:  # find() returns -1 once the tag is absent
                    j = line.find(flag[1], i)
                    # Text between the opening and closing tag.
                    txt = line[i + len(flag[0]):j]
                    i = line.find(flag[0], j)
                    fout.write(txt)
                    fout.write('\n')
        fout.close()
Example #4
def __corpus_counter(task):
    print(task)
    freq = defaultdict(int)
    with xutils.open_file(task) as file:
        for line in file:
            for word in line.strip().split():
                freq[word] += 1
                # Underscore-joined concepts also contribute to the counts
                # of their individual parts.
                tok = word.split('_')
                if len(tok) > 1:
                    for t in tok:
                        freq[t] += 1
    return freq
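Since __corpus_counter takes a single file path and returns a plain frequency dict, it lends itself to per-file parallel counting. The driver below is a hypothetical sketch, not part of the original repository, assuming the per-file counts are simply summed.

# Hypothetical aggregation of per-file counts with a process pool.
from collections import defaultdict
from multiprocessing import Pool

def corpus_count(files):
    total = defaultdict(int)
    with Pool() as pool:
        for freq in pool.map(__corpus_counter, files):
            for word, n in freq.items():
                total[word] += n
    return total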
Example #5
def wikipedia_extract(input_file, output_dir):
    # Target chunk size in characters per output file.
    chunk_size = 50000000
    os.makedirs(output_dir, exist_ok=True)
    fin = xutils.open_file(input_file, 'rt')
    # extract_pages yields (title, text, pageid); '0' keeps only pages from
    # the main article namespace.
    extractor = extract_pages(fin, ['0'])

    fout, counter, chunk = None, chunk_size, -1
    for page in extractor:
        if page[1]:
            text = filter_wiki(page[1])
            # Start a new output chunk once the current one is full.
            if counter >= chunk_size:
                if fout: fout.close()
                counter, chunk = 0, chunk + 1
                output_file = '%s/%s_%d.txt.gz' % (
                    output_dir, os.path.basename(input_file), chunk)
                fout = xutils.open_file(output_file, 'wt')
                print(output_file)

            counter += len(text)
            fout.write(text)
            fout.write('\n\n\n\n')
    if fout: fout.close()
    fin.close()
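A hypothetical invocation: the dump file name is illustrative, and it assumes xutils.open_file can transparently read the compressed Wikipedia dump (extract_pages and filter_wiki here are likely gensim's wikicorpus helpers).

# Hypothetical call: split a Wikipedia dump into ~50 MB plain-text chunks.
wikipedia_extract('enwiki-latest-pages-articles.xml.bz2', 'wikipedia_txt')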