def read_and_train_doc2vec(root_dir, fileids, output_file='', options=None):
    """Train and save a doc2vec model built from plaintext corpus files.

    Every file id becomes one ``TaggedDocument`` whose words come from
    ``PlaintextCorpusReader.words`` and whose single tag is the file id.
    If the first attempt raises ``UnicodeDecodeError``, the encoding of each
    file is detected with ``chardet``, installed on the reader, and the
    training is retried once.

    Args:
        root_dir: Corpus root directory passed to ``PlaintextCorpusReader``.
        fileids: One file id or a list of file ids.  Byte strings are decoded
            as UTF-8; unicode strings are used as they are.
        output_file: Base name of the saved model.  When empty, the file ids
            joined with ``-`` are used.  The final path is
            ``MODELS_DIR + output_file + '-' + options_to_string(options)``.
        options: Options forwarded to ``train_and_save_doc2vec`` and encoded
            into the output name.  ``None`` (the default) means an empty dict.

    Raises:
        UnicodeDecodeError: If decoding still fails after the per-file
            encodings have been detected and applied.
    """
    import os  # local import so the block does not depend on file-level imports

    # A fresh dict per call: a shared ``{}`` default could leak state between
    # calls if a callee ever mutates it.
    options = {} if options is None else options

    if not isinstance(fileids, list):
        fileids = [fileids]
    # ``unicode(u'...', 'utf8')`` raises TypeError, so only decode byte strings.
    fileids = [
        f if isinstance(f, unicode) else unicode(f, 'utf8')
        for f in fileids
    ]

    output_file = output_file or '-'.join(fileids)
    output_file = u"{0}{1}-{2}".format(
        MODELS_DIR, output_file, options_to_string(options))

    reader = PlaintextCorpusReader(root=root_dir, fileids=fileids)

    def build_docs():
        # Rebuilt on every attempt so the documents pick up whatever encoding
        # is configured on the reader at that moment.
        return [
            TaggedDocument(reader.words(fileid), [fileid])
            for fileid in fileids
        ]

    try:
        # Training stays inside the ``try``: NOTE(review) NLTK corpus views
        # appear to decode lazily, so the error may only surface while the
        # trainer iterates the words -- confirm against the NLTK version used.
        train_and_save_doc2vec(build_docs(), output_file, options)
    except UnicodeDecodeError:
        file_encodings = {}
        for fileid in fileids:
            # Read raw bytes for detection and close the handle promptly.
            with open(os.path.join(root_dir, fileid), 'rb') as handle:
                detected = chardet.detect(handle.read())
            file_encodings[fileid] = detected['encoding']
        # Install the per-file encoding map on the reader (private attribute,
        # as in the original implementation) and retry once.
        reader._encoding = file_encodings
        train_and_save_doc2vec(build_docs(), output_file, options)
def read_and_train(root_dir, fileids, output_file='', options=None):
    """Train and save a model from the sentences of plaintext corpus files.

    Sentences are obtained from ``PlaintextCorpusReader.sents`` over all the
    given file ids and handed to ``train_and_save``.  If the first attempt
    raises ``UnicodeDecodeError``, the encoding of each file is detected with
    ``chardet``, installed on the reader, and the training is retried once.

    Args:
        root_dir: Corpus root directory passed to ``PlaintextCorpusReader``.
        fileids: One file id or a list of file ids.  Byte strings are decoded
            as UTF-8; unicode strings are used as they are.
        output_file: Base name of the saved model.  When empty, the file ids
            joined with ``-`` are used.  The final path is
            ``MODELS_DIR + output_file + '-' + options_to_string(options)``.
        options: Options forwarded to ``train_and_save`` and encoded into the
            output name.  ``None`` (the default) means an empty dict.

    Raises:
        UnicodeDecodeError: If decoding still fails after the per-file
            encodings have been detected and applied.
    """
    import os  # local import so the block does not depend on file-level imports

    # A fresh dict per call: a shared ``{}`` default could leak state between
    # calls if a callee ever mutates it.
    options = {} if options is None else options

    if not isinstance(fileids, list):
        fileids = [fileids]
    # ``unicode(u'...', 'utf8')`` raises TypeError, so only decode byte strings.
    fileids = [
        f if isinstance(f, unicode) else unicode(f, 'utf8')
        for f in fileids
    ]

    output_file = output_file or '-'.join(fileids)
    output_file = u"{0}{1}-{2}".format(
        MODELS_DIR, output_file, options_to_string(options))

    reader = PlaintextCorpusReader(root=root_dir, fileids=fileids)

    try:
        sents = reader.sents()
        # Single-argument parenthesised form prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(fileids)
        # Training stays inside the ``try``: NOTE(review) NLTK corpus views
        # appear to decode lazily, so the error may only surface while the
        # trainer iterates the sentences -- confirm against the NLTK version.
        train_and_save(sents, output_file, options)
    except UnicodeDecodeError:
        # Existing diagnostic marker for the fallback path; text kept as-is.
        print("here")
        file_encodings = {}
        for fileid in fileids:
            # Read raw bytes for detection and close the handle promptly.
            with open(os.path.join(root_dir, fileid), 'rb') as handle:
                detected = chardet.detect(handle.read())
            file_encodings[fileid] = detected['encoding']
        # Install the per-file encoding map on the reader (private attribute,
        # as in the original implementation) and retry once.
        reader._encoding = file_encodings
        train_and_save(reader.sents(), output_file, options)