import re

from java.lang import Integer
from java.nio.file import Paths
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import BooleanQuery, IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory


def retriever(file_dir):
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    # lift the clause limit entirely so very long code queries can be parsed
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src", 'r') as fso, \
            open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]

    with open(file_dir + "/test/test.ast.src") as ft, \
            open(file_dir + "/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir + "/output/ast.out", 'w') as fws:
        # strip non-word characters and the boolean operator keywords
        queries = [
            re.sub(r"[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]
        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False
            for hit in hits:
                doc = searcher.doc(hit.doc)
                # the stored id is a string; eval turns it back into an int
                # index (int(...) would be the safer choice here)
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
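# Usage sketch (an assumption, not part of the original snippet): PyLucene
# requires the JVM to be started once per process before any of the classes
# above can be touched, so a caller would look roughly like this. The data
# directory path is hypothetical.
import lucene

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # must precede any Lucene call
    retriever('/path/to/data')  # hypothetical directory containing lucene_index/, train/, test/, output/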
def __init__(self):
    self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                             vmargs=['-Djava.awt.headless=true'])
    self.vocab = None
    BooleanQuery.setMaxClauseCount(2048)

    if not os.path.exists(prm.index_folder):
        print 'Creating index at', prm.index_folder
        if prm.docs_path == prm.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(prm.index_folder, prm.docs_path, add_terms)

    if prm.local_index_folder:
        print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
        if os.path.exists(prm.local_index_folder):
            print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
        else:
            shutil.copytree(prm.index_folder, prm.local_index_folder)
        self.index_folder = prm.local_index_folder
    else:
        self.index_folder = prm.index_folder

    fsDir = MMapDirectory(Paths.get(prm.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

    if prm.docs_path != prm.docs_path_term:
        if not os.path.exists(prm.index_folder_term):
            print 'Creating index at', prm.index_folder_term
            self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)
        if prm.local_index_folder_term:
            print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
            if os.path.exists(prm.local_index_folder_term):
                print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
            self.index_folder_term = prm.local_index_folder_term
        else:
            self.index_folder_term = prm.index_folder_term
        fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
        self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=prm.n_threads)
    self.cache = {}

    print 'Loading Title-ID mapping...'
    self.title_id_map, self.id_title_map = self.get_title_id_map()
def run_aud(searcher, analyzer, path):
    print 1
    print "Searching for:", path
    p = resolve.voice()
    p.loaddata(path)
    p.fft()
    # turn the detected high points of the spectrum into a '+'-joined query string
    command = str(p.high_point[:-1])
    command = command[1:-1]
    command = '+'.join(command.split('('))
    command = '+'.join(command.split(')'))
    BooleanQuery.setMaxClauseCount(100000)
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    # result columns (pinyin names): geshou = singers, geming = song names,
    # zhuanji = albums, liupai = styles, shijian = times,
    # jianjie = descriptions, geci = lyrics, imgurl = image URLs
    geshou = []
    geming = []
    zhuanji = []
    liupai = []
    shijian = []
    jianjie = []
    geci = []
    imgurl = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        geming.append(doc.get("song_name"))
        geshou.append(doc.get('singer_name'))
        zhuanji.append(doc.get('album_name'))
        liupai.append(doc.get('style'))
        shijian.append(doc.get('time'))
        geci.append(doc.get('lyrics'))
        jianjie.append(doc.get('describe'))
        imgurl.append(doc.get("img"))
    print geci[0]
    print jianjie
    return geshou, geming, zhuanji, liupai, shijian, jianjie, geci, imgurl
def __init__(self, indexDir, computeLengthNorm=True):
    # Initialization. indexDir: directory of the index files.
    # computeLengthNorm: whether to apply the custom similarity
    # (True: do not apply, False: apply).
    # if not jpype.isJVMStarted():
    #     lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # standard analyzer; for English it splits on delimiters
    self.path = os.path.join(INDEX_PATH, indexDir)  # storage path
    self.store = SimpleFSDirectory(File(self.path))
    # self.reader = DirectoryReader.open(self.store)
    self.reader = IndexReader.open(self.store)
    self.numDocs = self.reader.maxDoc()
    self.searcher = IndexSearcher(self.reader)
    sim = CustomSimilarity()  # addby zmq
    if not computeLengthNorm:
        sim = CustomSimilarity()
        self.searcher.setSimilarity(sim)
    self.mlt = MoreLikeThis(self.reader, sim)
    self.mlt.setAnalyzer(self.analyzer)
    self.mlt.setMinTermFreq(1)
    self.mlt.setMinDocFreq(1)
    # debug
    self.mlt.setMinWordLen(1)
    self.mlt.setMaxNumTokensParsed(100000000)
    BooleanQuery.setMaxClauseCount(1024 * 1024)  # raise the clause limit to avoid the "too many boolean clauses" error on long queries
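# Usage sketch (my assumption about how this class is driven; the helper name
# and signature are hypothetical, not from the original code): MoreLikeThis
# can build a query directly from an already-indexed document number, which
# the searcher then uses to rank similar documents.
def similar(self, doc_num, top_n=10):  # hypothetical helper method
    query = self.mlt.like(doc_num)  # like(int docNum) builds the MLT query from that document's terms
    return self.searcher.search(query, top_n).scoreDocs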
def __init__(self, lucene_vm, index_dir):
    lucene_vm.attachCurrentThread()
    BooleanQuery.setMaxClauseCount(2 ** 16)  # to avoid 'too many boolean clauses'
    self.index_dir = index_dir
    # self.fields = dict()
    # self.fields['doc_id'] = FieldType(StringField)
    # self.fields['doc_id'].setStored(True)
    # self.fields['doc_id'].setIndexOptions(IndexOptions.DOCS)
    # self.fields['content'] = FieldType(TextField)
    # self.fields['content'].setStored(True)
    # self.fields['content'].setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # self.fields['content'].setStoreTermVectors(True)
    self.directory = SimpleFSDirectory(Paths.get(self.index_dir))
    self.analyzer = WhitespaceAnalyzer()
    self.parser = QueryParser('content', self.analyzer)
    # self.parser.setDefaultOperator(QueryParser.Operator.AND)
    self.writer = None
    self.reader = None
    self.searcher = None
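# Usage sketch (an assumption; the original class only shows initialization,
# and this method name is hypothetical): open the reader/searcher lazily, then
# run an escaped free-text query through the 'content' parser configured above.
def search(self, text, top_n=10):  # hypothetical method
    if self.searcher is None:
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
    query = self.parser.parse(QueryParser.escape(text))
    return self.searcher.search(query, top_n).scoreDocs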
class DocSimilarity(object):

    def __init__(self):
        # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.boolean_query = BooleanQuery()
        self.similarityOfSynopsis()
        self.similarityOfStoryLine()

    def similarityOfSynopsis(self):
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                while True:
                    try:
                        query = queryParser.parse(QueryParser.escape(content))
                    except Exception as e:
                        # double the clause limit until the query parses
                        self.boolean_query.setMaxClauseCount(
                            self.boolean_query.maxClauseCount * 2)
                        print self.boolean_query.maxClauseCount
                        continue
                    break
                topDocs = searcher.search(query, len(filenames))
                scoreDocs = topDocs.scoreDocs
                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    movie_id = int(doc.get(FIELD_PATH))
                    if movie_id <= major_movie.id:
                        continue
                    minor_movie = models.Movie.objects.get(pk=movie_id)
                    try:
                        similarity = models.Similarities.objects.filter(
                            first_movie=major_movie, second_movie=minor_movie).first()
                        if not similarity:
                            similarity = models.Similarities.objects.filter(
                                first_movie=minor_movie, second_movie=major_movie).first()
                        similarity.synopsis = scoreDoc.score
                        similarity.save()
                    except Exception as e:
                        print major_movie.id, minor_movie.id
                        raise e
                print u"{0} completed.".format(major_movie.id)

    def similarityOfStoryLine(self):
        directory = SimpleFSDirectory(File(settings.STORYLINE_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.STORYLINE):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                query = queryParser.parse(QueryParser.escape(content))
                topDocs = searcher.search(query, len(filenames))
                scoreDocs = topDocs.scoreDocs
                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    movie_id = int(doc.get(FIELD_PATH))
                    if movie_id <= major_movie.id:
                        continue
                    minor_movie = models.Movie.objects.get(pk=movie_id)
                    try:
                        similarity = models.Similarities.objects.filter(
                            first_movie=major_movie, second_movie=minor_movie).first()
                        if not similarity:
                            similarity = models.Similarities.objects.filter(
                                first_movie=minor_movie, second_movie=major_movie).first()
                        similarity.storyline = scoreDoc.score
                        similarity.save()
                    except Exception as e:
                        print major_movie.id, minor_movie.id
                        raise e
                print u"{0} completed.".format(major_movie.id)
def __recs_query(self, positive_rated_document_list, scores, recs_number,
                 items_directory, candidate_list: List) -> pd.DataFrame:
    """
    Builds a query using the contents that the user liked. The terms relative
    to the contents that the user liked are boosted by the rating he/she gave.
    A filter clause is added to the query to consider only candidate items.

    Args:
        positive_rated_document_list: List of contents that the user liked
        scores: Ratings given by the user
        recs_number: How many items must be recommended; only the number can
            be specified, not a specific item for which to compute the prediction
        items_directory: Directory where the items are stored
        candidate_list: List of the items that can be recommended

    Returns:
        score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
    """
    BooleanQuery.setMaxClauseCount(2000000)
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(Paths.get(items_directory))))
    if self.__classic_similarity:
        searcher.setSimilarity(ClassicSimilarity())

    # collect the field values of the rated documents into a per-field profile
    field_list = searcher.doc(positive_rated_document_list[0]).getFields()
    user_fields = {}
    field_parsers = {}
    analyzer = SimpleAnalyzer()
    for field in field_list:
        if field.name() == 'content_id':
            continue
        user_fields[field.name()] = field.stringValue()
        field_parsers[field.name()] = QueryParser(field.name(), analyzer)

    positive_rated_document_list.remove(positive_rated_document_list[0])

    for _ in positive_rated_document_list:
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] += field.stringValue()

    logger.info("Building query")

    query_builder = BooleanQuery.Builder()
    for score in scores:
        for field_name in user_fields.keys():
            if field_name == 'content_id':
                continue
            field_parsers[field_name].setDefaultOperator(QueryParser.Operator.OR)
            field_query = field_parsers[field_name].escape(user_fields[field_name])
            field_query = field_parsers[field_name].parse(field_query)
            field_query = BoostQuery(field_query, score)  # boost by the user's rating
            query_builder.add(field_query, BooleanClause.Occur.SHOULD)

    if candidate_list is not None:
        id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                      for content_id in candidate_list)
        id_query = QueryParser("testo_libero", KeywordAnalyzer()).parse(id_query_string)
        query_builder.add(id_query, BooleanClause.Occur.MUST)

    query = query_builder.build()
    docs_to_search = len(positive_rated_document_list) + recs_number
    scoreDocs = searcher.search(query, docs_to_search).scoreDocs

    logger.info("Building score frame to return")

    recorded_items = 0
    columns = ['to_id', 'rating']
    score_frame = pd.DataFrame(columns=columns)
    for scoreDoc in scoreDocs:
        if recorded_items >= recs_number:
            break
        if scoreDoc.doc not in positive_rated_document_list:
            doc = searcher.doc(scoreDoc.doc)
            item_id = doc.getField("content_id").stringValue()
            recorded_items += 1
            score_frame = pd.concat([
                score_frame,
                pd.DataFrame.from_records([(item_id, scoreDoc.score)], columns=columns)
            ])
    return score_frame
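# Worked example of the candidate filter built above (the ids are illustrative,
# not values from the original code):
candidates = ['i1', 'i2']  # hypothetical content_id values
id_query_string = ' OR '.join('content_id:"' + c + '"' for c in candidates)
# -> 'content_id:"i1" OR content_id:"i2"'
# Parsed with a KeywordAnalyzer the ids are matched verbatim, and Occur.MUST
# ANDs this filter with the boosted SHOULD clauses of the user profile, so
# only candidates can appear in the ranking.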
def __init__(self, DATA_DIR, vocab, n_threads, max_terms_per_doc,
             index_name, index_name_term, docs_path, docs_path_term, use_cache):
    self.n_threads = n_threads
    # folders to store lucene's indexes; created in case they do not exist
    self.index_folder = DATA_DIR + '/data/' + index_name + '/'
    self.index_folder_term = DATA_DIR + '/data/' + index_name_term + '/'
    self.local_index_folder = './' + index_name
    self.local_index_folder_term = './' + index_name_term
    self.use_cache = use_cache
    self.docs_path = docs_path
    self.docs_path_term = docs_path_term
    self.max_terms_per_doc = max_terms_per_doc
    self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                             vmargs=['-Djava.awt.headless=true'])
    self.vocab = vocab
    BooleanQuery.setMaxClauseCount(2048)

    if not os.path.exists(self.index_folder):
        print 'Creating index at', self.index_folder
        if self.docs_path == self.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(self.index_folder, self.docs_path, add_terms)

    if self.local_index_folder:
        print 'copying index from', self.index_folder, 'to', self.local_index_folder
        if os.path.exists(self.local_index_folder):
            print 'Folder', self.local_index_folder, 'already exists! Doing nothing.'
        else:
            shutil.copytree(self.index_folder, self.local_index_folder)
        self.index_folder = self.local_index_folder

    fsDir = MMapDirectory(Paths.get(self.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

    if self.docs_path != self.docs_path_term:
        if not os.path.exists(self.index_folder_term):
            print 'Creating index at', self.index_folder_term
            self.create_index(self.index_folder_term, self.docs_path_term, add_terms=True)
        if self.local_index_folder_term:
            print 'copying index from', self.index_folder_term, 'to', self.local_index_folder_term
            if os.path.exists(self.local_index_folder_term):
                print 'Folder', self.local_index_folder_term, 'already exists! Doing nothing.'
            else:
                shutil.copytree(self.index_folder_term, self.local_index_folder_term)
            self.index_folder_term = self.local_index_folder_term
        fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
        self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=self.n_threads)
    self.cache = {}

    print 'Loading Title-ID mapping...'
    self.title_id_map, self.id_title_map = self.get_title_id_map()
fsDir = SimpleFSDirectory(Paths.get('index'))
searcher = IndexSearcher(DirectoryReader.open(fsDir))
if CLASSIC_SIMILARITY:
    searcher.setSimilarity(ClassicSimilarity())

analyzer = EnglishAnalyzer()
tags_parser = QueryParser(TAGS_LABEL, analyzer)
genres_parser = QueryParser(GENRES_LABEL, analyzer)
descr_parser = QueryParser(DESCR_LABEL, analyzer)
tags_parser.setDefaultOperator(QueryParser.Operator.OR)
genres_parser.setDefaultOperator(QueryParser.Operator.OR)
descr_parser.setDefaultOperator(QueryParser.Operator.OR)

BooleanQuery.setMaxClauseCount(2000000)  # prevents the 1024-clause limit error for very long queries

############################## Build user queries ##########################

ratings = ML1M('../datasets/ml-1m').ratings
movies_descriptions = pd.read_csv('../datasets/movies-descriptions.csv')
movies_tags = pd.read_csv('../datasets/movies-tags.csv')
movies_genres = pd.read_csv('../datasets/movies-genres.csv')

users = set(ratings[['user']].values.flatten())

output_path = OUTPUT_FOLDER + OUTPUT_FILE_NAME + pick_tag() + '.csv'

# write csv doc header
f = open(output_path, 'w')
f.write('user,item\n')
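# Sketch of the per-user loop the header above leads into (my reconstruction
# under stated assumptions, not the original code: the 'item', 'rating', and
# 'tag' column names, the rating >= 4 threshold, and the stored 'item' field
# are all assumptions): concatenate the tags of each user's liked movies into
# one OR query over the tags field and write the top hits to the csv.
for user in users:
    liked = ratings[(ratings['user'] == user) & (ratings['rating'] >= 4)]['item']  # assumed columns
    profile = ' '.join(movies_tags[movies_tags['item'].isin(liked)]['tag'].astype(str))  # assumed columns
    if not profile:
        continue
    query = tags_parser.parse(QueryParser.escape(profile))
    for hit in searcher.search(query, 10).scoreDocs:
        item = searcher.doc(hit.doc).get('item')  # assumes an 'item' stored field
        f.write('%s,%s\n' % (user, item))
f.close()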