def transform_thread(n_processors):
    """Transform every feature_* file into a signal, fanned out in parallel.

    Loads the document-hash mapper, the pickled doc model (providing M and b),
    the trained word2vec content model, and the R matrix, then runs
    ``transform_to_signal`` over each ``feature_*`` file using *n_processors*
    joblib workers.

    :param n_processors: number of parallel workers (joblib ``n_jobs``).
    :return: list of per-file results from ``transform_to_signal``,
             in sorted-filename order.
    """
    queries = {}
    dictionary = load_doc_hashes("doc_mapper.txt")
    # Pickle is a binary format: must open with "rb" (text mode corrupts the
    # stream on Windows and raises a decode error under Python 3).
    with open("doc_model2.pickle", "rb") as f:
        doc_model = pickle.load(f)
    M = doc_model.M
    b = doc_model.b
    model = models.Word2Vec.load("content.word2vec")
    # Vector dimensionality, probed from an arbitrary vocabulary entry.
    # next(iter(...)) works on both Python 2 and 3, unlike keys()[0].
    # NOTE(review): model[word] / model.vocab is the legacy gensim (<4.0)
    # API — confirm the installed gensim version supports it.
    dim = len(model[next(iter(model.vocab))])
    R = io.mmread("R_old.mtx").T
    files = sorted(glob.glob(os.path.join(os.getcwd(), "feature_*")))
    signals = Parallel(n_jobs=n_processors)(
        delayed(transform_to_signal)(fname, queries, dictionary, M, b, model, dim, R)
        for fname in files
    )
    return signals
__author__ = 'Martin'

from doc2vec.doc_to_vec import load_doc_hashes
from doc2vec.doc_to_vec import MySentences


class MyWords(MySentences):
    """Restartable line iterator over a single file.

    Unlike holding an open handle, the file is re-opened on each
    iteration, so the object can be iterated more than once.
    """

    def __init__(self, fname):
        # Path of the feature file to stream.
        self.fname = fname

    def __iter__(self):
        with open(self.fname) as f:
            for line in f:
                yield line


if __name__ == "__main__":
    features = MyWords("temp_features.rtData")
    # Read the corpus once; each element keeps its trailing newline.
    # (Original left this handle unclosed — now managed by `with`.)
    with open("content.raw_text") as src:
        documents = [line for line in src]
    docs = load_doc_hashes("doc_mapper.txt")
    rows = []
    for row in features:
        # Row layout: "<qid-and-features> # <tab-separated metadata>".
        parse_row = row.split("#", 1)
        metadata_array = parse_row[1].strip().split('\t')
        # qid is parsed but not used below; kept for debugging parity.
        qid = parse_row[0].strip().split(':', 1)[1].split(' ', 1)[0]
        # Renamed from `hash`, which shadowed the builtin.
        doc_hash = metadata_array[2]
        rows.append(documents[docs[doc_hash]])
    with open("temp_all.raw_text", "w") as f:
        for r in rows:
            f.write(r)
            # NOTE(review): `r` already ends with "\n", so this produces
            # blank separator lines — preserved as existing behavior.
            f.write("\n")
from doc2vec.doc_to_vec import load_doc_hashes
from doc2vec.doc_to_vec import MySentences


class MyWords(MySentences):
    """Iterate over the raw lines of a file, one line per step.

    The file is opened fresh in ``__iter__``, so iteration is
    repeatable and no handle is held between iterations.
    """

    def __init__(self, fname):
        # Path of the feature file to stream.
        self.fname = fname

    def __iter__(self):
        with open(self.fname) as f:
            for line in f:
                yield line


if __name__ == "__main__":
    features = MyWords("temp_features.rtData")
    # Load the whole corpus up front; lines retain trailing newlines.
    # (Fixed: the original never closed this file handle.)
    with open("content.raw_text") as src:
        documents = [line for line in src]
    docs = load_doc_hashes("doc_mapper.txt")
    rows = []
    for row in features:
        # Row layout: "<qid-and-features> # <tab-separated metadata>".
        parse_row = row.split("#", 1)
        metadata_array = parse_row[1].strip().split("\t")
        # qid is extracted but unused downstream; kept unchanged.
        qid = parse_row[0].strip().split(":", 1)[1].split(" ", 1)[0]
        # `doc_hash` replaces the original `hash`, which shadowed a builtin.
        doc_hash = metadata_array[2]
        rows.append(documents[docs[doc_hash]])
    with open("temp_all.raw_text", "w") as f:
        for r in rows:
            f.write(r)
            # NOTE(review): lines already end in "\n", so this inserts a
            # blank line between records — existing behavior, preserved.
            f.write("\n")