Example 1
import csv

import numpy as np
import pandas as pd

# file_length.file_len, cos_similarity, and update_progress are project-local
# helpers assumed to be importable from the surrounding package.


def get_loadings(agg_doc_vecs_path, agg_dic_vecs_path, out_path, num_features, delimiter='\t'):
    '''
    Compute loadings between each document vector in agg_doc_vecs_path and each
    dictionary dimension in agg_dic_vecs_path.

    :param agg_doc_vecs_path: Path to distributed representations of documents
    :param agg_dic_vecs_path: Path to distributed representations of dictionaries
    :param out_path: Path to write results to
    :param num_features: Number of dimensions in the distributed representations
    :param delimiter: Delimiter used in the input and output files
    :return: None. Loadings are written to out_path.
    '''
    n_docs = float(file_length.file_len(agg_doc_vecs_path))
    prog_counter = 0
    counter = 0
    dic_vecs = pd.read_csv(agg_dic_vecs_path, sep=delimiter)
    dic_vecs = dic_vecs.to_dict(orient='list')
    nan_counter = {'ID': [], 'count': 0}

    with open(agg_doc_vecs_path, 'r', newline='') as doc_vecs, \
            open(out_path, 'w', newline='') as out_file:

        doc_vecs_reader = csv.reader(doc_vecs, delimiter=delimiter)
        next(doc_vecs_reader)  # skip the header row

        writer = csv.writer(out_file, delimiter=delimiter)
        fieldnames_out = ['ID'] + list(dic_vecs.keys())

        writer.writerow(fieldnames_out)

        for doc_vec in doc_vecs_reader:

            # Skip documents whose vectors contain missing values.
            if 'nan' in doc_vec:
                nan_counter['count'] += 1
                nan_counter['ID'].append(doc_vec[0])

            else:
                prog_counter += 1
                counter += 1
                doc_id = doc_vec[0]
                out_row = [doc_id]

                # Convert the entries to floats once; the original re-converted
                # the same list on every pass through the dictionary loop.
                doc_vec = [np.float64(x) for x in doc_vec[-num_features:]]

                for k in dic_vecs.keys():
                    dic_similarity = cos_similarity(doc_vec, dic_vecs[k])
                    out_row.append(dic_similarity)

                writer.writerow(out_row)
                if prog_counter >= 0.01 * n_docs:
                    prog_counter = 0
                    update_progress(counter / (n_docs - 1))

        print('Failed to calculate {0} loadings due to missing values.'.format(nan_counter['count']))
        print('IDs for documents with missing values:\n\n', nan_counter['ID'])
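
get_loadings relies on several helpers that are not part of the standard library. A minimal sketch of what cos_similarity, update_progress, and file_length.file_len might look like, assuming their names describe their behavior; none of this is taken from the source:

import sys

import numpy as np


def cos_similarity(vec_a, vec_b):
    # Assumed semantics: cosine similarity between two 1-D vectors.
    a = np.asarray(vec_a, dtype=np.float64)
    b = np.asarray(vec_b, dtype=np.float64)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


def update_progress(fraction):
    # Assumed semantics: print an in-place progress percentage.
    sys.stdout.write('\rProgress: {0:.0%}'.format(fraction))
    sys.stdout.flush()


def file_len(path):
    # Assumed semantics of file_length.file_len: count the lines in a file.
    with open(path) as f:
        return sum(1 for _ in f)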
Example 2
import csv

# Relies on project-local helpers: get_files, make_agg_vec, file_len, and update_progress.


def doc_vecs_from_txt(input_path, output_path, num_features, model, model_word_set, delimiter='\t', filter_out=[]):
    '''
    :param input_path: Path to text file(s) containing texts to be represented.
    This can be a single file or a directory containing multiple files.
    :param output_path: Path to file where results should be written
    :param num_features: Number of dimensions in word2vec model
    :param model: word2vec model to use for representation
    :param model_word_set: Set of unique words in word2vec model
    :param delimiter: Delimiter to use in the output file
    :param filter_out: Words to exclude from the representation
    :return: None. This function iteratively writes each document representation to file; no
    object is returned.
    '''

    path_info = get_files(input_path=input_path)

    with open(output_path, 'w', newline='') as out_file:

        fieldnames = ['ID'] + [str(fnum) for fnum in range(1, num_features + 1)]
        writer = csv.writer(out_file, delimiter=delimiter)
        writer.writerow(fieldnames)

        for input_path in path_info.values():

            with open(input_path, 'r') as docs:

                n_lines = float(file_len(input_path))

                print('Generating aggregate distributed representations of', n_lines, 'texts.')
                update_progress(0 / (n_lines - 1))

                prog_counter = 0
                counter = 0

                cur_id = 0
                n_na = 0

                for row in docs:

                    cur_id += 1
                    prog_counter += 1
                    counter += 1
                    words = row.split()

                    # Blank lines have nothing to aggregate; count them as NA.
                    # (The original split row[0], i.e. only the first character
                    # of the line, and relied on IndexError to skip blanks.)
                    if not words:
                        n_na += 1
                        continue

                    cur_agg_vec = make_agg_vec(words=words, model=model, num_features=num_features,
                                               model_word_set=model_word_set, filter_out=filter_out)
                    writer.writerow([cur_id] + list(cur_agg_vec))

                    if prog_counter >= 0.05 * n_lines:
                        prog_counter = 0
                        update_progress(counter / (n_lines - 1))
                print("\nFinished calculating aggregate document representations", "\nNumber of NA:", n_na)
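
Both doc_vecs_from_txt and the csv variant below lean on make_agg_vec and get_files, which are project-local and not shown in these examples. A minimal sketch, assuming make_agg_vec averages the word2vec vectors of in-vocabulary words (the usual aggregate-representation recipe) and get_files maps names to paths; both assumptions, not the project's actual code:

import os

import numpy as np


def make_agg_vec(words, model, num_features, model_word_set, filter_out=()):
    # Assumed semantics: average the vectors of in-vocabulary words,
    # skipping anything listed in filter_out.
    agg_vec = np.zeros(num_features, dtype=np.float64)
    n_words = 0
    for word in words:
        if word in model_word_set and word not in filter_out:
            agg_vec += model[word]
            n_words += 1
    return agg_vec / n_words if n_words else agg_vec


def get_files(input_path):
    # Assumed semantics: map file names to paths for a file or a directory.
    if os.path.isdir(input_path):
        return {name: os.path.join(input_path, name)
                for name in os.listdir(input_path)}
    return {os.path.basename(input_path): input_path}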
Example 3
import csv

# Relies on project-local helpers: make_agg_vec, file_len, and update_progress.


def doc_vecs_from_csv(input_path,
                      output_path,
                      model,
                      num_features,
                      model_word_set,
                      text_col,
                      delimiter,
                      filter_out=[],
                      quotechar=None,
                      id_col=False,
                      header=True):
    """
    Create a distributed representation of each document in a column of documents
    contained in the input file. These representations are written to the file
    specified by the 'output_path' parameter.

    :param input_path: Path to csv containing text to be represented
    :param output_path: Path to file where results should be written
    :param model: word2vec model to use for representation
    :param num_features: Number of dimensions in word2vec model
    :param model_word_set: Set of unique words in word2vec model
    :param text_col: Column containing text to be represented. This can either be a column name or an
    integer representing the column position. Note that column indices begin at 0.
    :param filter_out: Words to exclude from representations
    :param delimiter: Delimiter used to separate columns in the input file
    :param quotechar: If a quote character is used to enclose text fields in the
    input file, specify that character here.
    :param id_col: If the input file includes an ID column, specify its location
    via column name or position. If no ID column is available, set this to False;
    sequential integer IDs will then be generated, associating each representation
    with the row order of the original file.
    :param header: Boolean indicating whether the input file contains a header (True) or not (False).
    :return: None. This function iteratively writes each document representation to file, no
    object is returned.
    """

    # csv requires a quote character when quoting is enabled, so fall back to
    # the default '"' when none is supplied.
    if quotechar is None:
        quotechar = '"'

    with open(input_path, 'r', newline='') as docs_file, \
            open(output_path, 'w', newline='') as out_file:

        docs = csv.reader(docs_file, delimiter=delimiter, quotechar=quotechar)

        if header is True:
            header = next(docs)

            if id_col is not False:
                try:
                    id_col = header.index(id_col)
                except ValueError:
                    try:
                        id_col = int(id_col)
                    except ValueError:
                        print(
                            "ValueError: Column '{0}' not found, please make sure that the name or index was correctly listed"
                            .format(id_col))

            try:
                text_col = header.index(text_col)
            except ValueError:
                try:
                    text_col = int(text_col)
                except ValueError:
                    print(
                        "ValueError: Column '{0}' not found, please make sure that the name or index was correctly listed"
                        .format(text_col))

        if header is False:
            if id_col is not False:
                try:
                    id_col = int(id_col)
                except ValueError:
                    print(
                        "ValueError: Column '{0}' not found, please make sure that the index was correctly listed"
                        .format(id_col))

            try:
                text_col = int(text_col)
            except ValueError:
                print(
                    "ValueError: Column '{0}' not found, please make sure that the index was correctly listed"
                    .format(text_col))

        fieldnames = ['ID'] + [str(fnum) for fnum in range(1, num_features + 1)]
        writer = csv.writer(out_file, delimiter=delimiter, quotechar=quotechar)
        writer.writerow(fieldnames)

        n_lines = float(file_len(input_path))
        n_na = 0

        print('Generating aggregate distributed representations of', n_lines,
              'texts.')
        update_progress(0 / (n_lines - 1))

        prog_counter = 0
        counter = 0

        if id_col is False:
            cur_id = 0

            for row in docs:
                try:
                    cur_id += 1
                    prog_counter += 1
                    counter += 1

                    doc = row[text_col].split()
                    cur_agg_vec = make_agg_vec(words=doc,
                                               model=model,
                                               num_features=num_features,
                                               model_word_set=model_word_set,
                                               filter_out=filter_out)
                    writer.writerow([cur_id] + list(cur_agg_vec))

                    if prog_counter >= 0.05 * n_lines:
                        prog_counter = 0
                        update_progress(counter / (n_lines - 1))

                except IndexError:
                    n_na += 1

        else:
            for row in docs:
                prog_counter += 1
                counter += 1

                doc = row[text_col].split()
                cur_agg_vec = make_agg_vec(words=doc,
                                           model=model,
                                           num_features=num_features,
                                           model_word_set=model_word_set,
                                           filter_out=filter_out)

                writer.writerow([row[id_col]] + list(cur_agg_vec))

                if prog_counter >= 0.05 * n_lines:
                    prog_counter = 0
                    update_progress(counter / (n_lines - 1))

        # Print the summary regardless of which branch ran; the original only
        # printed it when an ID column was supplied.
        print("\nFinished calculating aggregate document representations",
              "\nNumber NA:", n_na)
Example 4
import csv

import pandas as pd
# The [0][0] indexing below implies a pairwise cosine_similarity that returns a
# matrix, as scikit-learn's does; treating it as sklearn's is an assumption.
from sklearn.metrics.pairwise import cosine_similarity

# file_length.file_len and update_progress are project-local helpers, as in Example 1.


def get_loadings(agg_doc_vecs_path, agg_dic_vecs_path, out_path, num_features, delimiter='\t'):
    """Compute loadings between each document vector in agg_doc_vecs_path and each
    dictionary dimension in agg_dic_vecs_path."""


    n_docs = float(file_length.file_len(agg_doc_vecs_path))
    prog_counter = 0
    counter = 0
    dic_vecs = pd.read_csv(agg_dic_vecs_path, sep=delimiter)
    dic_vecs = dic_vecs.to_dict(orient='list')

    with open(agg_doc_vecs_path, 'r', newline='') as doc_vecs, \
            open(out_path, 'w', newline='') as out_file:

        doc_vecs_reader = csv.reader(doc_vecs, delimiter=delimiter)
        next(doc_vecs_reader)  # skip the header row

        writer = csv.writer(out_file, delimiter=delimiter)
        fieldnames_out = ['ID'] + list(dic_vecs.keys())

        writer.writerow(fieldnames_out)

        for doc_vec in doc_vecs_reader:

            prog_counter += 1
            counter += 1
            doc_id = doc_vec[0]
            out_row = [doc_id]

            # Convert the entries to floats once, outside the dictionary loop.
            doc_vec = [float(x) for x in doc_vec[-num_features:]]

            for k in dic_vecs.keys():
                # cosine_similarity expects 2-D input, so wrap each vector in a
                # single-row list; [0][0] extracts the scalar similarity.
                dic_similarity = cosine_similarity([doc_vec], [dic_vecs[k]])[0][0]
                out_row.append(dic_similarity)

            writer.writerow(out_row)

            if prog_counter >= 0.05 * n_docs:
                prog_counter = 0
                update_progress(counter / (n_docs - 1))

        print('Finished calculating document loadings')


#get_loadings('out_test.txt', 'dic_vecs_out_test.tsv', 'hope.tsv')
#
# if __name__ == "__main__":
#
# # This is not finished.
#
#     if sys.argv[1] == 'make_dic_vecs':
#
#         model, num_features, model_word_set = load_model(model_path=sys.argv[2])
#         dic_terms = getDicTerms(sys.argv[3])
#         dic_vecs = getAggDicVec(dic_terms)
#         writeDicVecs(dic_vecs=dic_vecs, out_path=sys.argv[4])
#
#     elif sys.argv[1] == 'make_doc_vecs':
#
#         model, num_features, model_word_set = load_model(model_path=sys.argv[2])
#         getAggDocVecs(docs_path=sys.argv[3], out_path=sys.argv[4], text_col=sys.argv[5])
#
#     elif sys.argv[1] == 'get_loadings':
#
#         model, num_features, model_word_set = load_model(model_path=sys.argv[2])
#         get_loadings(agg_doc_vecs_path=sys.argv[3], agg_dic_vecs_path=sys.argv[4], out_path=sys.argv[5])
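
For reference, a hypothetical call of this variant; every path is illustrative only, and dic_vecs.tsv is assumed to hold one named column per dictionary dimension:

# Hypothetical call: all paths below are illustrative.
get_loadings(agg_doc_vecs_path='doc_vecs.tsv',
             agg_dic_vecs_path='dic_vecs.tsv',
             out_path='loadings.tsv',
             num_features=300)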