Example No. 1
import glob
import os
import pickle

from gensim import models
from joblib import Parallel, delayed
from scipy import io
from doc2vec.doc_to_vec import load_doc_hashes


def transform_thread(n_processors):
    # transform_to_signal is assumed to be defined elsewhere in this module.
    queries = {}
    dictionary = load_doc_hashes("doc_mapper.txt")
    with open("doc_model2.pickle", "rb") as f:  # pickles must be opened in binary mode
        doc_model = pickle.load(f)
    M = doc_model.M
    b = doc_model.b
    model = models.Word2Vec.load("content.word2vec")
    dim = model.vector_size  # dimensionality of the word vectors
    R = io.mmread("R_old.mtx").T
    files = sorted(glob.glob(os.path.join(os.getcwd(), "feature_*")))
    # Transform every feature file in parallel, one joblib job per file.
    signals = Parallel(n_jobs=n_processors)(delayed(transform_to_signal)(fname, queries, dictionary, M, b, model, dim, R) for fname in files)
    return signals
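
The helper transform_to_signal is not shown above. As a usage note, a minimal driver for transform_thread might look like the sketch below; the module name "transform" and the choice of cpu_count() are assumptions for illustration, not part of the original example.

# Hypothetical driver: the module name "transform" and cpu_count() are
# assumptions for illustration only.
from multiprocessing import cpu_count

from transform import transform_thread

if __name__ == "__main__":
    # One worker per CPU core; each feature_* file yields one signal.
    signals = transform_thread(cpu_count())
    print("transformed %d feature files" % len(signals))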
Example No. 2
__author__ = 'Martin'


from doc2vec.doc_to_vec import load_doc_hashes
from doc2vec.doc_to_vec import MySentences

class MyWords(MySentences):
    """Stream the lines of a feature file lazily, one row at a time."""

    def __init__(self, fname):
        self.fname = fname

    def __iter__(self):
        with open(self.fname) as f:
            for line in f:
                yield line

if __name__ == "__main__":
    features = MyWords("temp_features.rtData")
    # content.raw_text holds one raw document per line.
    with open("content.raw_text") as f:
        documents = f.readlines()
    docs = load_doc_hashes("doc_mapper.txt")  # document hash -> line index
    rows = []
    for row in features:
        # Each row looks like "<features> # <tab-separated metadata>".
        parse_row = row.split("#", 1)
        metadata_array = parse_row[1].strip().split('\t')
        qid = parse_row[0].strip().split(':', 1)[1].split(' ', 1)[0]  # query id (unused below)
        doc_hash = metadata_array[2]  # third metadata field is the document hash
        rows.append(documents[docs[doc_hash]])
    with open("temp_all.raw_text","w") as f:
        for r in rows:
            f.write(r)
            f.write("\n")
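
Both examples so far rely on load_doc_hashes from doc2vec.doc_to_vec to map a document hash to its line index in content.raw_text. Its real implementation is not shown here; the sketch below is a plausible reading, assuming doc_mapper.txt lists one document hash per line in the same order as content.raw_text (that file layout is an assumption).

# Sketch only: assumes doc_mapper.txt stores one document hash per line,
# ordered like the lines of content.raw_text.
def load_doc_hashes(fname):
    mapping = {}
    with open(fname) as f:
        for idx, line in enumerate(f):
            mapping[line.strip()] = idx  # hash -> line index
    return mapping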
Example No. 3
from doc2vec.doc_to_vec import load_doc_hashes
from doc2vec.doc_to_vec import MySentences


class MyWords(MySentences):
    """Stream the lines of a feature file lazily, one row at a time."""

    def __init__(self, fname):
        self.fname = fname

    def __iter__(self):
        with open(self.fname) as f:
            for line in f:
                yield line


if __name__ == "__main__":
    features = MyWords("temp_features.rtData")
    # content.raw_text holds one raw document per line.
    with open("content.raw_text") as f:
        documents = f.readlines()
    docs = load_doc_hashes("doc_mapper.txt")  # document hash -> line index
    rows = []
    for row in features:
        # Each row looks like "<features> # <tab-separated metadata>".
        parse_row = row.split("#", 1)
        metadata_array = parse_row[1].strip().split("\t")
        qid = parse_row[0].strip().split(":", 1)[1].split(" ", 1)[0]  # query id (unused below)
        doc_hash = metadata_array[2]  # third metadata field is the document hash
        rows.append(documents[docs[doc_hash]])
    with open("temp_all.raw_text", "w") as f:
        for r in rows:
            f.write(r)
            f.write("\n")
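
For reference, the parsing in the loop above appears to expect the query id right after "qid:" and the document hash in the third tab-separated metadata field. The sample row below is invented purely to illustrate that extraction; the concrete label, URL, and title are not from the original data.

# Made-up sample row in the shape the loop above expects:
# "<label> qid:<id> <feat>:<val> ... # <metadata col 0>\t<col 1>\t<doc hash>"
row = "2 qid:17 1:0.3 2:0.8 # http://example.org\tsome title\tabc123"
features_part, metadata_part = row.split("#", 1)
qid = features_part.strip().split(":", 1)[1].split(" ", 1)[0]  # -> "17"
doc_hash = metadata_part.strip().split("\t")[2]                # -> "abc123"
print(qid, doc_hash)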