def doc_vecs_from_txt(input_path, output_path, num_features, model, model_word_set, delimiter='\t', filter_out=None):
    """Write one aggregate vector per line of each text file to a delimited file.

    Every line of every input file is treated as one document: it is split
    on whitespace and the tokens are handed to ``make_agg_vec``. The output
    starts with a header row ``ID, 1, ..., num_features`` followed by one
    row ``[ID] + vector`` per document.

    NOTE(review): a second definition of this function follows directly
    after this one in the file; the later definition is the one that takes
    effect at import time.

    Args:
        input_path: Passed to ``get_files``; the values of the mapping it
            returns are opened as the input files.
        output_path: Path of the output file (created or overwritten).
        num_features: Number of vector components; sets the header width
            and is forwarded to ``make_agg_vec``.
        model: Forwarded unchanged to ``make_agg_vec``.
        model_word_set: Forwarded unchanged to ``make_agg_vec``.
        delimiter: Field delimiter used for the output file.
        filter_out: Forwarded to ``make_agg_vec`` as ``filter_out``;
            ``None`` (the default) means an empty list.

    Notes:
        IDs are 1-based line numbers and restart at 1 for every input file.
        Documents that raise ``IndexError`` are skipped (their ID is still
        consumed) and counted in the "Number of NA" summary.
    """
    if filter_out is None:
        filter_out = []

    path_info = get_files(input_path=input_path)

    with open(output_path, 'wb') as out_file:
        fieldnames = ['ID'] + [unicode(fnum) for fnum in range(1, num_features + 1)]
        writer = csv.writer(out_file, delimiter=delimiter)
        writer.writerow(fieldnames)

        # A distinct loop name keeps the ``input_path`` argument intact.
        for doc_path in path_info.itervalues():
            with open(doc_path, 'rb') as docs:
                n_lines = float(file_len(doc_path))
                print('Generating aggregate distributed representations of {0} texts.'.format(n_lines))

                # A one-line file would otherwise divide by zero.
                progress_denom = n_lines - 1 if n_lines > 1 else 1.0
                update_progress(0 / progress_denom)

                prog_counter = 0
                n_na = 0

                for cur_id, line in enumerate(docs, 1):
                    prog_counter += 1
                    try:
                        # Split the whole line, not just its first character.
                        cur_agg_vec = make_agg_vec(
                            words=line.split(),
                            model=model,
                            num_features=num_features,
                            model_word_set=model_word_set,
                            filter_out=filter_out)
                        writer.writerow([cur_id] + list(cur_agg_vec))
                    except IndexError:
                        # Best effort: skip the document but keep counting.
                        n_na += 1
                        continue

                    # Refresh the progress display roughly every 5% of lines.
                    if prog_counter >= 0.05 * n_lines:
                        prog_counter = 0
                        update_progress(cur_id / progress_denom)

                print("\nFinished calculating aggregate document representations \nNumber of NA: {0}".format(n_na))
def doc_vecs_from_txt(input_path, output_path, num_features, model, model_word_set, delimiter='\t', filter_out=None):
    """Generate aggregate vectors for text files holding one document per line.

    For each file obtained from ``get_files``, every line is whitespace-
    tokenised and passed to ``make_agg_vec``; the resulting vector is
    written to ``output_path`` as ``[ID] + vector``. A single header row
    ``ID, 1, ..., num_features`` is written first.

    NOTE(review): an earlier definition with the same name appears directly
    before this one in the file; this later definition is the effective one.

    Args:
        input_path: Passed to ``get_files``; the values of the returned
            mapping are the files that get read.
        output_path: Destination file, opened for writing (truncated).
        num_features: Vector length; determines the header columns and is
            forwarded to ``make_agg_vec``.
        model: Forwarded unchanged to ``make_agg_vec``.
        model_word_set: Forwarded unchanged to ``make_agg_vec``.
        delimiter: Output field delimiter.
        filter_out: Forwarded to ``make_agg_vec``; defaults to an empty
            list when ``None``.

    Notes:
        IDs are 1-based line numbers within each file, so they repeat when
        several files are processed. Lines for which ``IndexError`` is
        raised are not written and are tallied as NA.
    """
    words_to_skip = [] if filter_out is None else filter_out
    path_info = get_files(input_path=input_path)

    with open(output_path, 'wb') as out_file:
        writer = csv.writer(out_file, delimiter=delimiter)
        writer.writerow(['ID'] + [unicode(fnum) for fnum in range(1, num_features + 1)])

        for txt_path in path_info.itervalues():
            n_lines = float(file_len(txt_path))
            # Avoid ZeroDivisionError when the file has a single line.
            denom = n_lines - 1 if n_lines > 1 else 1.0
            step = 0.05 * n_lines  # lines between progress refreshes

            with open(txt_path, 'rb') as docs:
                print('Generating aggregate distributed representations of {0} texts.'.format(n_lines))
                update_progress(0 / denom)

                since_update = 0
                n_na = 0
                cur_id = 0

                for line in docs:
                    cur_id += 1
                    since_update += 1
                    try:
                        # ``line`` is a string: tokenise all of it.
                        tokens = line.split()
                        cur_agg_vec = make_agg_vec(
                            words=tokens,
                            model=model,
                            num_features=num_features,
                            model_word_set=model_word_set,
                            filter_out=words_to_skip)
                        writer.writerow([cur_id] + list(cur_agg_vec))
                    except IndexError:
                        # Skip the document; the ID stays reserved for it.
                        n_na += 1
                    else:
                        if since_update >= step:
                            since_update = 0
                            update_progress(cur_id / denom)

                print("\nFinished calculating aggregate document representations \nNumber of NA: {0}".format(n_na))
def _resolve_csv_column(col, header, hint):
    """Return the integer index of ``col``: by name in ``header`` if given, else via ``int``."""
    if header is not None:
        try:
            return header.index(col)
        except ValueError:
            pass  # not a header name; fall back to a numeric index
    try:
        return int(col)
    except ValueError:
        raise ValueError(
            "Column '{0}' not found, please make sure that the {1} was correctly listed".format(col, hint))


def doc_vecs_from_csv(input_path, output_path, model, num_features, model_word_set, text_col, filter_out=None, delimiter='\t', id_col=False):
    """Make aggregate vectors from documents stored in CSV format and write them to a file.

    The CSV dialect and the presence of a header row are sniffed from the
    first 1024 bytes of the input. The text of each row is split on
    whitespace and passed to ``make_agg_vec``; the output has a header row
    ``ID, 1, ..., num_features`` followed by one ``[ID] + vector`` row per
    document.

    NOTE(review): a second definition of this function follows directly
    after this one in the file; the later definition is the one that takes
    effect at import time.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the output file (created or overwritten).
        model: Forwarded unchanged to ``make_agg_vec``.
        num_features: Vector length; determines the header columns and is
            forwarded to ``make_agg_vec``.
        model_word_set: Forwarded unchanged to ``make_agg_vec``.
        text_col: Column holding the document text. A header name or an
            integer index when a header is detected, otherwise an index.
        filter_out: Forwarded to ``make_agg_vec``; ``None`` (the default)
            means an empty list.
        delimiter: Field delimiter for the output file; overrides the
            delimiter of the sniffed dialect.
        id_col: Column holding the document ID, given like ``text_col``.
            ``False`` (the default) numbers rows from 1 instead.

    Raises:
        ValueError: If ``text_col`` or ``id_col`` is neither a header name
            nor convertible to an integer index.

    Notes:
        Rows that raise ``IndexError`` (for example, rows too short to hold
        the requested column) are skipped and counted as NA.
    """
    if filter_out is None:
        filter_out = []

    with open(input_path, 'rb') as docs_file, open(output_path, 'wb') as out_file:
        sample = docs_file.read(1024)
        docs_file.seek(0)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        check_header = sniffer.has_header(sample)

        docs = csv.reader(docs_file, dialect)

        if check_header:
            print('Header identified')
            header = next(docs)
            hint = 'name or index'
        else:
            print('No header identified')
            header = None
            hint = 'index'

        # ``id_col`` may legitimately be 0, so test identity with False.
        use_id_col = id_col is not False
        if use_id_col:
            id_col = _resolve_csv_column(id_col, header, hint)
        text_col = _resolve_csv_column(text_col, header, hint)

        fieldnames = ['ID'] + [unicode(fnum) for fnum in range(1, num_features + 1)]
        writer = csv.writer(out_file, dialect, delimiter=delimiter)
        writer.writerow(fieldnames)

        n_lines = float(file_len(input_path))
        n_na = 0
        print('Generating aggregate distributed representations of {0} texts.'.format(n_lines))

        # A one-line file would otherwise divide by zero.
        progress_denom = n_lines - 1 if n_lines > 1 else 1.0
        update_progress(0 / progress_denom)

        prog_counter = 0

        for row_num, row in enumerate(docs, 1):
            prog_counter += 1
            try:
                doc = row[text_col].split()
                row_id = row[id_col] if use_id_col else row_num
                cur_agg_vec = make_agg_vec(
                    words=doc,
                    model=model,
                    num_features=num_features,
                    model_word_set=model_word_set,
                    filter_out=filter_out)
                writer.writerow([row_id] + list(cur_agg_vec))
            except IndexError:
                # Best effort: skip the row but keep counting.
                n_na += 1
                continue

            # Refresh the progress display roughly every 5% of lines.
            if prog_counter >= 0.05 * n_lines:
                prog_counter = 0
                update_progress(row_num / progress_denom)

        print("\nFinished calculating aggregate document representations \nNumber NA: {0}".format(n_na))
def _resolve_csv_column(col, header, hint):
    """Return the integer index of ``col``: by name in ``header`` if given, else via ``int``."""
    if header is not None:
        try:
            return header.index(col)
        except ValueError:
            pass  # not a header name; fall back to a numeric index
    try:
        return int(col)
    except ValueError:
        raise ValueError(
            "Column '{0}' not found, please make sure that the {1} was correctly listed".format(col, hint))


def doc_vecs_from_csv(input_path, output_path, model, num_features, model_word_set, text_col, filter_out=None, delimiter='\t', id_col=False):
    """Make aggregate vectors from documents stored in CSV format. Write these to output file.

    The first 1024 bytes of the input are used to sniff the CSV dialect and
    to decide whether a header row is present. Each row's text column is
    whitespace-tokenised and passed to ``make_agg_vec``; the result is
    written as ``[ID] + vector`` beneath a header row
    ``ID, 1, ..., num_features``.

    NOTE(review): an earlier definition with the same name appears directly
    before this one in the file; this later definition is the effective one.

    Args:
        input_path: CSV file to read.
        output_path: Destination file, opened for writing (truncated).
        model: Forwarded unchanged to ``make_agg_vec``.
        num_features: Vector length; sets the header width and is forwarded
            to ``make_agg_vec``.
        model_word_set: Forwarded unchanged to ``make_agg_vec``.
        text_col: Header name or integer index of the text column (index
            only when no header is detected).
        filter_out: Forwarded to ``make_agg_vec``; defaults to an empty
            list when ``None``.
        delimiter: Output field delimiter; takes precedence over the
            sniffed dialect's delimiter.
        id_col: Header name or integer index of the ID column, or ``False``
            (the default) to use a running 1-based row number.

    Raises:
        ValueError: If a requested column is neither a header name nor
            convertible to an integer index.

    Notes:
        Rows raising ``IndexError`` are not written and are tallied as NA.
    """
    words_to_skip = [] if filter_out is None else filter_out

    with open(input_path, 'rb') as docs_file, open(output_path, 'wb') as out_file:
        head_bytes = docs_file.read(1024)
        docs_file.seek(0)
        dialect = csv.Sniffer().sniff(head_bytes)
        has_header = csv.Sniffer().has_header(head_bytes)

        docs = csv.reader(docs_file, dialect)

        header = None
        hint = 'index'
        if has_header:
            print('Header identified')
            header = next(docs)
            hint = 'name or index'
        else:
            print('No header identified')

        # Identity check: an ``id_col`` of 0 is a valid column index.
        has_ids = id_col is not False
        if has_ids:
            id_col = _resolve_csv_column(id_col, header, hint)
        text_col = _resolve_csv_column(text_col, header, hint)

        writer = csv.writer(out_file, dialect, delimiter=delimiter)
        writer.writerow(['ID'] + [unicode(fnum) for fnum in range(1, num_features + 1)])

        n_lines = float(file_len(input_path))
        # Avoid ZeroDivisionError when the file has a single line.
        denom = n_lines - 1 if n_lines > 1 else 1.0
        step = 0.05 * n_lines  # rows between progress refreshes

        print('Generating aggregate distributed representations of {0} texts.'.format(n_lines))
        update_progress(0 / denom)

        n_na = 0
        since_update = 0
        counter = 0

        for row in docs:
            counter += 1
            since_update += 1
            try:
                tokens = row[text_col].split()
                doc_id = row[id_col] if has_ids else counter
                cur_agg_vec = make_agg_vec(
                    words=tokens,
                    model=model,
                    num_features=num_features,
                    model_word_set=model_word_set,
                    filter_out=words_to_skip)
                writer.writerow([doc_id] + list(cur_agg_vec))
            except IndexError:
                # Skip the row; it still counts toward the running number.
                n_na += 1
            else:
                if since_update >= step:
                    since_update = 0
                    update_progress(counter / denom)

        print("\nFinished calculating aggregate document representations \nNumber NA: {0}".format(n_na))