Ejemplo n.º 1
0
def process_filelist(source_dir, data_dir, filelist, start, end, crash=False):
    """Run process_list_element on each selected element of a file list.

    The (n, fname) pairs to process are produced by
    elements(filelist, start, end).

    Args:
        source_dir: passed through as first argument to process_list_element.
        data_dir: passed through as second argument to process_list_element.
        filelist: passed to elements() together with start and end.
        start: passed to elements().
        end: passed to elements().
        crash: when true, exceptions raised while processing an element
            propagate to the caller; when false, they are reported on
            stderr and stdout and processing continues with the next element.
    """
    for n, fname in elements(filelist, start, end):
        if crash:
            # Use the same argument list as the guarded call below; this
            # branch previously dropped source_dir, so crash mode invoked
            # the helper with different (shifted) arguments.
            process_list_element(source_dir, data_dir, n, fname)
            continue
        try:
            process_list_element(source_dir, data_dir, n, fname)
        except Exception as e:
            # Best-effort mode: report the failing element and keep going.
            sys.stderr.write("ERROR on %07d  %s\n" % (n, fname))
            print('ERROR:', Exception, e)
Ejemplo n.º 2
0
def run_tarsqi(data_dir, filelist, start, end, crash=False):
    """Call run_tarsqi_for_file for each selected element of the file list.

    Echoes the invoking command line first. For every (index, name) pair
    produced by elements(filelist, start, end) the element is printed with
    print_element and then processed. If crash is false, an Exception raised
    for a file is reported on stdout and the loop moves on; if crash is true
    the exception propagates to the caller.
    """
    command_line = ' '.join(sys.argv)
    print("$ python3 %s" % command_line)
    for index, name in elements(filelist, start, end):
        print_element(index, name)
        try:
            run_tarsqi_for_file(data_dir, name)
        except Exception as err:
            if crash:
                # crash mode: let the failure surface unchanged
                raise
            print('ERROR:', Exception, err)
def _collect_data(data_dir, filelist, start, end):
    """Collect one filtered token list per selected file.

    For every (n, fname) pair produced by elements(filelist, start, end),
    the file <data_dir>/lif/<fname minus its last four characters>.lif is
    loaded through Container, its payload text is run through
    prepare_text_for_lda, and the ignored words are dropped. Progress lines
    and the total token count are printed to stdout.

    Returns:
        A list with one list of tokens per processed file.
    """
    # especially the first two occur in most abstracts so let's ignore them
    ignored = {'title', 'abstract', 'result', 'study'}
    documents = []
    for n, fname in elements(filelist, start, end):
        print("%07d  %s" % (n, fname))
        lif_name = fname[:-4] + '.lif'
        payload = Container(os.path.join(data_dir, 'lif', lif_name)).payload
        tokens = [tok for tok in prepare_text_for_lda(payload.text.value)
                  if tok not in ignored]
        documents.append(tokens)
    total = sum(len(doc) for doc in documents)
    print('\nToken count = %d' % total)
    return documents
def create_documents(data_dir, filelist, start, end, crash=False):
    """Call create_document for each selected element of the file list.

    Echoes the invoking command line, creates the 'ela' subdirectory of
    data_dir if it does not exist yet, and then handles every (index, name)
    pair produced by elements(filelist, start, end). If crash is false, an
    Exception raised for a file is reported on stdout and processing
    continues; if crash is true the exception propagates.
    """
    print("$ python3 %s\n" % ' '.join(sys.argv))
    output_dir = os.path.join(data_dir, 'ela')
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for index, name in elements(filelist, start, end):
        print_element(index, name)
        if crash:
            # crash mode: no safety net around the call
            create_document(data_dir, name)
            continue
        try:
            create_document(data_dir, name)
        except Exception as err:
            print('ERROR:', Exception, err)
def generate_topics(data_dir, filelist, start, end, crash=False):
    """Call generate_topics_for_file for each selected element of the file list.

    Echoes the invoking command line, loads the model and the dictionary,
    and builds a topic_id -> topic mapping from the model's print_topics
    output. Every (index, name) pair produced by
    elements(filelist, start, end) is then printed and processed. If crash
    is false, an Exception raised for a file is reported on stdout and
    stderr and processing continues; if crash is true it propagates.
    """
    print("$ python3 %s\n" % ' '.join(sys.argv))
    model = load_model()
    # print_topics yields (topic_id, topic) pairs, which dict() consumes directly
    topics_by_id = dict(model.print_topics(num_topics=NUM_TOPICS))
    vocabulary = load_dictionary()
    for index, name in elements(filelist, start, end):
        print_element(index, name)
        try:
            generate_topics_for_file(
                data_dir, name, model, topics_by_id, vocabulary)
        except Exception as err:
            if crash:
                # crash mode: let the failure surface unchanged
                raise
            print('ERROR:', Exception, err)
            sys.stderr.write("ERROR on %07d  %s\n" % (index, name))
Ejemplo n.º 6
0
def process_filelist(source_dir, data_dir, filelist, start, end,
                     crash=False, test=False):
    """Call process_list_element for each selected element of the file list.

    Echoes the invoking command line, then prints and processes every
    (index, name) pair produced by elements(filelist, start, end). The test
    flag is forwarded unchanged to process_list_element. If crash is false,
    an Exception raised for a file is reported on stdout and processing
    continues; if crash is true the exception propagates.
    """
    command_line = ' '.join(sys.argv)
    print("$ python3 %s\n" % command_line)
    for index, name in elements(filelist, start, end):
        print_element(index, name)
        if crash:
            # crash mode: no safety net around the call
            process_list_element(source_dir, data_dir, name, test=test)
            continue
        try:
            process_list_element(source_dir, data_dir, name, test=test)
        except Exception as err:
            print('ERROR:', Exception, err)
Ejemplo n.º 7
0
def generate_topics(data_dir, filelist, start, end, crash=False):
    """Call generate_topics_for_file for each selected element of the file list.

    Loads the model and the dictionary and builds a topic_id -> topic
    mapping from the model's print_topics output. Every (index, name) pair
    produced by elements(filelist, start, end) is then printed and
    processed. If crash is false, an Exception raised for a file is reported
    on stdout and processing continues; if crash is true it propagates.
    """
    model = load_model()
    topics_by_id = {}
    for topic_id, topic in model.print_topics(num_topics=NUM_TOPICS):
        topics_by_id[topic_id] = topic
    vocabulary = load_dictionary()
    for index, name in elements(filelist, start, end):
        print_element(index, name)
        try:
            generate_topics_for_file(
                data_dir, name, model, topics_by_id, vocabulary)
        except Exception as err:
            if crash:
                # crash mode: let the failure surface unchanged
                raise
            print('ERROR:', Exception, err)