def create_queries_vec(project):
    """Build (or load) the ordered token corpus of bug-report queries.

    For each id listed in ``<project.full_path>/ids.txt``, reads the bug's
    short and long descriptions from the ``queries`` directory, joins and
    preprocesses them, and serializes the result as an ``OrderedCorpus``.
    On later calls the serialized file is reused instead of rebuilt.

    NOTE(review): this file defines ``create_queries_vec`` twice; the later
    definition shadows this one -- confirm which is intended and drop the other.

    :param project: object with a ``full_path`` string attribute
        (presumably ending in a path separator, since filenames are built
        by plain concatenation -- TODO confirm).
    :return: an ``OrderedCorpus`` over the preprocessed query documents,
        each tagged with metadata ``(query_id, 'query')``.
    """
    # Filenames are built by concatenation, matching the rest of the file.
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.ordered.gz'

    if not os.path.exists(corpus_fname):
        preprocessor = GeneralCorpus(lazy_dict=True)

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [line.strip() for line in f]

        queries = []
        # `query_id` rather than `id`: avoid shadowing the builtin.
        for query_id in ids:
            short_path = os.path.join(project.full_path, 'queries',
                                      'ShortDescription' + query_id + '.txt')
            long_path = os.path.join(project.full_path, 'queries',
                                     'LongDescription' + query_id + '.txt')
            with open(short_path) as f:
                short_desc = f.read()
            with open(long_path) as f:
                long_desc = f.read()

            text = ' '.join([short_desc, long_desc])
            tokens = list(preprocessor.preprocess(text))
            queries.append((tokens, (query_id, 'query')))

        OrderedCorpus.serialize(corpus_fname, queries, metadata=True)

    corpus = OrderedCorpus(corpus_fname)
    return corpus
def create_queries_vec(project):
    """Build (or load) the ordered token corpus of bug-report queries.

    Reads each bug id from ``<project.full_path>/ids.txt``, concatenates the
    bug's short and long descriptions, preprocesses the text, and serializes
    the documents as an ``OrderedCorpus``; an existing serialized corpus on
    disk is reused without rebuilding.

    NOTE(review): ``create_queries_vec`` is defined twice in this file with
    identical bodies; this later definition shadows the earlier one --
    confirm intent and remove the duplicate.

    :param project: object exposing a ``full_path`` string attribute
        (assumed to end with a path separator, since the corpus filename is
        built by plain string concatenation -- TODO confirm).
    :return: an ``OrderedCorpus`` of preprocessed query documents with
        metadata ``(query_id, 'query')``.
    """
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.ordered.gz'

    if not os.path.exists(corpus_fname):
        preprocessor = GeneralCorpus(lazy_dict=True)

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [line.strip() for line in f]

        queries = []
        # Renamed loop variable from `id` to avoid shadowing the builtin.
        for query_id in ids:
            with open(os.path.join(project.full_path, 'queries',
                                   'ShortDescription' + query_id + '.txt')) as f:
                short_desc = f.read()
            with open(os.path.join(project.full_path, 'queries',
                                   'LongDescription' + query_id + '.txt')) as f:
                long_desc = f.read()

            text = ' '.join([short_desc, long_desc])
            tokens = list(preprocessor.preprocess(text))
            queries.append((tokens, (query_id, 'query')))

        OrderedCorpus.serialize(corpus_fname, queries, metadata=True)

    corpus = OrderedCorpus(corpus_fname)
    return corpus
def create_queries(project):
    """Build (or load) the bag-of-words Mallet corpus of bug-report queries.

    For each id in ``<project.full_path>/ids.txt``, joins the bug's short and
    long descriptions, preprocesses the text, converts it to a bag-of-words
    via a growing ``Dictionary``, and serializes everything as a
    ``MalletCorpus``. A corpus already on disk is reused without rebuilding.

    NOTE(review): ``create_queries`` is defined twice in this file; the later
    definition shadows this one -- confirm intent and drop the duplicate.

    :param project: object with a ``full_path`` string attribute (presumably
        ending in a path separator, since filenames are built by plain
        concatenation -- TODO confirm).
    :return: a ``MalletCorpus`` over the query documents, with the loaded
        dictionary attached when ``dict_fname`` exists on disk.
    """
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.ordered.gz'
    dict_fname = corpus_fname_base + '.dict.gz'

    if not os.path.exists(corpus_fname):
        preprocessor = GeneralCorpus(lazy_dict=True)
        id2word = Dictionary()

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [line.strip() for line in f]

        queries = []
        # `query_id` rather than `id`: avoid shadowing the builtin.
        for query_id in ids:
            with open(os.path.join(project.full_path, 'queries',
                                   'ShortDescription' + query_id + '.txt')) as f:
                short_desc = f.read()
            with open(os.path.join(project.full_path, 'queries',
                                   'LongDescription' + query_id + '.txt')) as f:
                long_desc = f.read()

            text = ' '.join([short_desc, long_desc])
            tokens = preprocessor.preprocess(text)
            # allow_update=True grows the dictionary with unseen words as
            # documents are added, so nothing is silently dropped.
            bow = id2word.doc2bow(tokens, allow_update=True)
            queries.append((bow, (query_id, 'query')))

        # write the corpus to disk. this will take awhile.
        # NOTE(review): nothing in this function writes dict_fname, so the
        # Dictionary.load branch below may never fire -- confirm whether the
        # dictionary should also be saved (e.g. id2word.save(dict_fname)).
        MalletCorpus.serialize(corpus_fname, queries, id2word=id2word,
                               metadata=True)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)
    return corpus
def create_queries(project):
    """Build (or load) the bag-of-words Mallet corpus of bug-report queries.

    Reads each bug id from ``<project.full_path>/ids.txt``, concatenates the
    bug's short and long descriptions, preprocesses them, maps the tokens to
    a bag-of-words through a growing ``Dictionary``, and serializes the
    result as a ``MalletCorpus``; a corpus already on disk is reused.

    NOTE(review): ``create_queries`` is defined twice in this file with
    identical bodies; this later definition shadows the earlier one --
    confirm intent and remove the duplicate.

    :param project: object exposing a ``full_path`` string attribute
        (assumed to end in a path separator, since filenames are built by
        plain string concatenation -- TODO confirm).
    :return: a ``MalletCorpus`` of the query documents, with the dictionary
        attached when ``dict_fname`` exists on disk.
    """
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.ordered.gz'
    dict_fname = corpus_fname_base + '.dict.gz'

    if not os.path.exists(corpus_fname):
        preprocessor = GeneralCorpus(lazy_dict=True)
        id2word = Dictionary()

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [line.strip() for line in f]

        queries = []
        # Renamed loop variable from `id` to avoid shadowing the builtin.
        for query_id in ids:
            with open(os.path.join(project.full_path, 'queries',
                                   'ShortDescription' + query_id + '.txt')) as f:
                short_desc = f.read()
            with open(os.path.join(project.full_path, 'queries',
                                   'LongDescription' + query_id + '.txt')) as f:
                long_desc = f.read()

            text = ' '.join([short_desc, long_desc])
            tokens = preprocessor.preprocess(text)
            # allow_update=True lets doc2bow add unseen words to the
            # dictionary instead of discarding them.
            bow = id2word.doc2bow(tokens, allow_update=True)
            queries.append((bow, (query_id, 'query')))

        # write the corpus to disk. this will take awhile.
        # NOTE(review): dict_fname is never written by this function, so the
        # Dictionary.load branch below may be dead -- confirm whether the
        # dictionary should be persisted (e.g. id2word.save(dict_fname)).
        MalletCorpus.serialize(corpus_fname, queries, id2word=id2word,
                               metadata=True)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)
    return corpus