Example #1
0
def doc_vecs_from_txt(input_path,
                      output_path,
                      num_features,
                      model,
                      model_word_set,
                      delimiter='\t',
                      filter_out=None):
    """Write one aggregate vector per text line to a delimited output file.

    Every file reported by ``get_files(input_path=input_path)`` is read line
    by line. Each line is split on whitespace and handed to ``make_agg_vec``.
    The output starts with a header row ``ID, 1, ..., num_features`` followed
    by one row per processed line: the 1-based line number within its input
    file, then the values returned by ``make_agg_vec``.

    Lines for which ``make_agg_vec`` raises ``IndexError`` are skipped and
    counted; the count is printed once each input file has been processed.

    Args:
        input_path: Location passed to ``get_files`` to discover input files.
        output_path: Path of the delimited file to create (overwritten).
        num_features: Number of feature columns announced in the header; also
            forwarded to ``make_agg_vec``.
        model: Forwarded unchanged to ``make_agg_vec``.
        model_word_set: Forwarded unchanged to ``make_agg_vec``.
        delimiter: Field separator used for the output file.
        filter_out: Forwarded to ``make_agg_vec``. ``None`` (the default) is
            treated as an empty list.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if filter_out is None:
        filter_out = []

    path_info = get_files(input_path=input_path)

    with open(output_path, 'wb') as out_file:

        fieldnames = ['ID'] + [
            unicode(fnum) for fnum in range(1, num_features + 1)
        ]
        writer = csv.writer(out_file, delimiter=delimiter)
        writer.writerow(fieldnames)

        # NOTE(review): IDs restart at 1 for every input file, so several
        # input files yield repeated IDs in the output -- confirm intended.
        for doc_path in path_info.itervalues():

            with open(doc_path, 'rb') as docs:

                # presumably the number of lines in the file -- see file_len
                n_lines = float(file_len(doc_path))
                # Progress is reported relative to n_lines - 1; clamp the
                # denominator so a one-line file cannot divide by zero.
                progress_total = max(n_lines - 1, 1.0)

                print('Generating aggregate distributed representations of '
                      '{0} texts.'.format(n_lines))
                update_progress(0.0)

                since_update = 0
                n_na = 0

                for line_no, line in enumerate(docs, 1):
                    since_update += 1

                    try:
                        # Tokenise the whole line; indexing the line first
                        # would keep only its first character.
                        cur_agg_vec = make_agg_vec(
                            words=line.split(),
                            model=model,
                            num_features=num_features,
                            model_word_set=model_word_set,
                            filter_out=filter_out)
                    except IndexError:
                        # No vector could be built for this line; skip it.
                        n_na += 1
                        continue

                    writer.writerow([line_no] + list(cur_agg_vec))

                    # Refresh the progress display roughly every 5% of lines.
                    if since_update >= 0.05 * n_lines:
                        since_update = 0
                        update_progress(line_no / progress_total)

                print('\nFinished calculating aggregate document '
                      'representations \nNumber of NA: {0}'.format(n_na))
Example #2
0
def doc_vecs_from_txt(input_path, output_path, num_features, model, model_word_set, delimiter='\t', filter_out=None):
    """Write one aggregate vector per text line to a delimited output file.

    Every file reported by ``get_files(input_path=input_path)`` is read line
    by line. Each line is split on whitespace and handed to ``make_agg_vec``.
    The output starts with a header row ``ID, 1, ..., num_features`` followed
    by one row per processed line: the 1-based line number within its input
    file, then the values returned by ``make_agg_vec``.

    Lines for which ``make_agg_vec`` raises ``IndexError`` are skipped and
    counted; the count is printed once each input file has been processed.

    Args:
        input_path: Location passed to ``get_files`` to discover input files.
        output_path: Path of the delimited file to create (overwritten).
        num_features: Number of feature columns announced in the header; also
            forwarded to ``make_agg_vec``.
        model: Forwarded unchanged to ``make_agg_vec``.
        model_word_set: Forwarded unchanged to ``make_agg_vec``.
        delimiter: Field separator used for the output file.
        filter_out: Forwarded to ``make_agg_vec``. ``None`` (the default) is
            treated as an empty list.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if filter_out is None:
        filter_out = []

    path_info = get_files(input_path=input_path)

    with open(output_path, 'wb') as out_file:

        fieldnames = ['ID'] + [unicode(fnum) for fnum in range(1, num_features + 1)]
        writer = csv.writer(out_file, delimiter=delimiter)
        writer.writerow(fieldnames)

        # NOTE(review): IDs restart at 1 for every input file, so several
        # input files yield repeated IDs in the output -- confirm intended.
        for doc_path in path_info.itervalues():

            with open(doc_path, 'rb') as docs:

                # presumably the number of lines in the file -- see file_len
                n_lines = float(file_len(doc_path))
                # Progress is reported relative to n_lines - 1; clamp the
                # denominator so a one-line file cannot divide by zero.
                progress_total = max(n_lines - 1, 1.0)

                print('Generating aggregate distributed representations of {0} texts.'.format(n_lines))
                update_progress(0.0)

                since_update = 0
                n_na = 0

                for line_no, line in enumerate(docs, 1):
                    since_update += 1

                    try:
                        # Tokenise the whole line; indexing the line first
                        # would keep only its first character.
                        cur_agg_vec = make_agg_vec(words=line.split(), model=model, num_features=num_features,
                                                   model_word_set=model_word_set, filter_out=filter_out)
                    except IndexError:
                        # No vector could be built for this line; skip it.
                        n_na += 1
                        continue

                    writer.writerow([line_no] + list(cur_agg_vec))

                    # Refresh the progress display roughly every 5% of lines.
                    if since_update >= 0.05 * n_lines:
                        since_update = 0
                        update_progress(line_no / progress_total)

                print('\nFinished calculating aggregate document representations \nNumber of NA: {0}'.format(n_na))
Example #3
0
def _resolve_column(column, header=None):
    """Translate a column name or index into an integer column index.

    When ``header`` is given, ``column`` is first looked up as a name in it.
    If that fails, or when there is no header, ``column`` is converted with
    ``int``.

    Raises:
        ValueError: If ``column`` is neither a header name nor convertible
            to an integer.
    """
    if header is not None:
        try:
            return header.index(column)
        except ValueError:
            pass
        expected = 'name or index'
    else:
        expected = 'index'

    try:
        return int(column)
    except ValueError:
        raise ValueError(
            "Column '{0}' not found, please make sure that the {1} was "
            "correctly listed".format(column, expected))


def doc_vecs_from_csv(input_path,
                      output_path,
                      model,
                      num_features,
                      model_word_set,
                      text_col,
                      filter_out=None,
                      delimiter='\t',
                      id_col=False):
    """Make aggregate vectors from documents stored in CSV format.

    The input dialect and the presence of a header row are detected with
    ``csv.Sniffer`` from the first 1024 bytes of the file. The text of each
    row is split on whitespace and handed to ``make_agg_vec``. The output
    starts with a header row ``ID, 1, ..., num_features`` followed by one row
    per processed document: its ID, then the values returned by
    ``make_agg_vec``.

    Rows that raise ``IndexError`` (for instance rows too short to contain
    the requested columns) are skipped and counted; the count is printed at
    the end.

    Args:
        input_path: Path of the CSV file holding the documents.
        output_path: Path of the delimited file to create (overwritten).
        model: Forwarded unchanged to ``make_agg_vec``.
        num_features: Number of feature columns announced in the header; also
            forwarded to ``make_agg_vec``.
        model_word_set: Forwarded unchanged to ``make_agg_vec``.
        text_col: Header name or integer index of the text column. Only an
            index is accepted when no header is detected.
        filter_out: Forwarded to ``make_agg_vec``. ``None`` (the default) is
            treated as an empty list.
        delimiter: Field separator used for the output file.
        id_col: Header name or integer index of the ID column. ``False``
            (the default) numbers the data rows from 1 instead.

    Raises:
        ValueError: If ``text_col`` or ``id_col`` cannot be resolved to a
            column index.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if filter_out is None:
        filter_out = []

    with open(input_path, 'rb') as docs_file, open(output_path,
                                                   'wb') as out_file:

        # One sample serves both the dialect and the header detection.
        sample = docs_file.read(1024)
        docs_file.seek(0)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)

        docs = csv.reader(docs_file, dialect)

        if has_header:
            print('Header identified')
            header = next(docs)
        else:
            print('No header identified')
            header = None

        # Fail before any output rows are written if a column is unknown.
        # `id_col is not False` is deliberate: index 0 is a valid ID column.
        if id_col is not False:
            id_col = _resolve_column(id_col, header)
        text_col = _resolve_column(text_col, header)

        fieldnames = ['ID'] + [
            unicode(fnum) for fnum in range(1, num_features + 1)
        ]
        writer = csv.writer(out_file, dialect, delimiter=delimiter)
        writer.writerow(fieldnames)

        # presumably the number of lines in the file -- see file_len
        n_lines = float(file_len(input_path))
        # Progress is reported relative to n_lines - 1; clamp the denominator
        # so a one-line file cannot divide by zero.
        progress_total = max(n_lines - 1, 1.0)
        n_na = 0

        print('Generating aggregate distributed representations of '
              '{0} texts.'.format(n_lines))
        update_progress(0.0)

        since_update = 0

        for row_no, row in enumerate(docs, 1):
            since_update += 1

            try:
                doc = row[text_col].split()
                row_id = row_no if id_col is False else row[id_col]
                cur_agg_vec = make_agg_vec(words=doc,
                                           model=model,
                                           num_features=num_features,
                                           model_word_set=model_word_set,
                                           filter_out=filter_out)
            except IndexError:
                # Row lacks a requested column or no vector could be built.
                n_na += 1
                continue

            writer.writerow([row_id] + list(cur_agg_vec))

            # Refresh the progress display roughly every 5% of lines.
            if since_update >= 0.05 * n_lines:
                since_update = 0
                update_progress(row_no / progress_total)

        # Report the summary whether or not an ID column was used.
        print('\nFinished calculating aggregate document representations '
              '\nNumber NA: {0}'.format(n_na))
Example #4
0
def _resolve_column(column, header=None):
    """Translate a column name or index into an integer column index.

    When ``header`` is given, ``column`` is first looked up as a name in it.
    If that fails, or when there is no header, ``column`` is converted with
    ``int``.

    Raises:
        ValueError: If ``column`` is neither a header name nor convertible
            to an integer.
    """
    if header is not None:
        try:
            return header.index(column)
        except ValueError:
            pass
        expected = 'name or index'
    else:
        expected = 'index'

    try:
        return int(column)
    except ValueError:
        raise ValueError(
            "Column '{0}' not found, please make sure that the {1} was correctly listed".format(column, expected))


def doc_vecs_from_csv(input_path, output_path, model, num_features, model_word_set, text_col, filter_out=None, delimiter='\t',
                  id_col=False):
    """Make aggregate vectors from documents stored in CSV format.

    The input dialect and the presence of a header row are detected with
    ``csv.Sniffer`` from the first 1024 bytes of the file. The text of each
    row is split on whitespace and handed to ``make_agg_vec``. The output
    starts with a header row ``ID, 1, ..., num_features`` followed by one row
    per processed document: its ID, then the values returned by
    ``make_agg_vec``.

    Rows that raise ``IndexError`` (for instance rows too short to contain
    the requested columns) are skipped and counted; the count is printed at
    the end.

    Args:
        input_path: Path of the CSV file holding the documents.
        output_path: Path of the delimited file to create (overwritten).
        model: Forwarded unchanged to ``make_agg_vec``.
        num_features: Number of feature columns announced in the header; also
            forwarded to ``make_agg_vec``.
        model_word_set: Forwarded unchanged to ``make_agg_vec``.
        text_col: Header name or integer index of the text column. Only an
            index is accepted when no header is detected.
        filter_out: Forwarded to ``make_agg_vec``. ``None`` (the default) is
            treated as an empty list.
        delimiter: Field separator used for the output file.
        id_col: Header name or integer index of the ID column. ``False``
            (the default) numbers the data rows from 1 instead.

    Raises:
        ValueError: If ``text_col`` or ``id_col`` cannot be resolved to a
            column index.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if filter_out is None:
        filter_out = []

    with open(input_path, 'rb') as docs_file, open(output_path, 'wb') as out_file:

        # One sample serves both the dialect and the header detection.
        sample = docs_file.read(1024)
        docs_file.seek(0)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)

        docs = csv.reader(docs_file, dialect)

        if has_header:
            print('Header identified')
            header = next(docs)
        else:
            print('No header identified')
            header = None

        # Fail before any output rows are written if a column is unknown.
        # `id_col is not False` is deliberate: index 0 is a valid ID column.
        if id_col is not False:
            id_col = _resolve_column(id_col, header)
        text_col = _resolve_column(text_col, header)

        fieldnames = ['ID'] + [unicode(fnum) for fnum in range(1, num_features + 1)]
        writer = csv.writer(out_file, dialect, delimiter=delimiter)
        writer.writerow(fieldnames)

        # presumably the number of lines in the file -- see file_len
        n_lines = float(file_len(input_path))
        # Progress is reported relative to n_lines - 1; clamp the denominator
        # so a one-line file cannot divide by zero.
        progress_total = max(n_lines - 1, 1.0)
        n_na = 0

        print('Generating aggregate distributed representations of {0} texts.'.format(n_lines))
        update_progress(0.0)

        since_update = 0

        for row_no, row in enumerate(docs, 1):
            since_update += 1

            try:
                doc = row[text_col].split()
                row_id = row_no if id_col is False else row[id_col]
                cur_agg_vec = make_agg_vec(words=doc, model=model, num_features=num_features, model_word_set=model_word_set, filter_out=filter_out)
            except IndexError:
                # Row lacks a requested column or no vector could be built.
                n_na += 1
                continue

            writer.writerow([row_id] + list(cur_agg_vec))

            # Refresh the progress display roughly every 5% of lines.
            if since_update >= 0.05 * n_lines:
                since_update = 0
                update_progress(row_no / progress_total)

        # Report the summary whether or not an ID column was used.
        print('\nFinished calculating aggregate document representations \nNumber NA: {0}'.format(n_na))