Ejemplo n.º 1
0
def retriever(file_dir):
    """Retrieve, for each test AST query, the closest training sample.

    Searches a Lucene index built over the training "code" field. For every
    query line in test/test.ast.src, the best-matching training source line
    is written to test/test.ref.src.0 and its summary to output/ast.out.

    Args:
        file_dir: Root directory containing lucene_index/, train/, test/
            and output/ subdirectories.
    """
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    # Lift the default 1024-clause limit: escaped code queries can be huge.
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src",
              'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso]
        summaries = [line.strip() for line in fsu]
    with open(file_dir + "/test/test.ast.src") as ft, \
            open(file_dir + "/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir + "/output/ast.out", 'w') as fws:
        # Raw string fixes the invalid "\W" escape in a plain literal:
        # strip non-word runs and bare boolean operators from each query.
        queries = [
            re.sub(r"[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft
        ]

        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False

            for hit in hits:
                doc = searcher.doc(hit.doc)
                # int() instead of eval(): the stored id is a plain integer
                # and eval on index contents is unsafe.
                _id = int(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                # No hit at all: dump the query for debugging and abort.
                print(query)
                print(hits)
                exit(-1)
Ejemplo n.º 2
0
    def __init__(self):
        """Start the JVM, open (building if needed) the Lucene indexes and
        prepare the searcher, analyzer and worker pool.

        Relies on the module-level ``prm`` configuration object for all
        paths and settings.
        """
        # Large heap: the index and in-memory caches are sizable.
        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            # A single shared corpus means the main index also stores terms.
            add_terms = prm.docs_path == prm.docs_path_term
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        # Separate term index is only needed when the term corpus differs.
        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print('Loading Title-ID mapping...')
        self.title_id_map, self.id_title_map = self.get_title_id_map()
Ejemplo n.º 3
0
def run_aud(searcher, analyzer, path):
    """Identify a song from an audio file and return its metadata.

    Fingerprints the audio at ``path`` via FFT peak points, turns the peaks
    into a Lucene query string, and collects metadata for up to 50 matching
    documents.

    Returns:
        Tuple of parallel lists: (singers, song names, albums, styles,
        times, descriptions, lyrics, image URLs).
    """
    print("Searching for:", path)
    p = resolve.voice()
    p.loaddata(path)
    p.fft()
    # Serialize the FFT peaks, dropping the trailing element and the outer
    # brackets; '+' replaces the parentheses to join query terms.
    command = str(p.high_point[:-1])
    command = command[1:-1]
    command = '+'.join(command.split('('))
    command = '+'.join(command.split(')'))
    BooleanQuery.setMaxClauseCount(100000)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs

    geshou = []   # singer names
    geming = []   # song names
    zhuanji = []  # album names
    liupai = []   # styles
    shijian = []  # times
    jianjie = []  # descriptions
    geci = []     # lyrics
    imgurl = []   # image URLs

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        geming.append(doc.get("song_name"))
        geshou.append(doc.get('singer_name'))
        zhuanji.append(doc.get('album_name'))
        liupai.append(doc.get('style'))
        shijian.append(doc.get('time'))
        geci.append(doc.get('lyrics'))
        jianjie.append(doc.get('describe'))
        imgurl.append(doc.get("img"))
    # Guard against zero hits before peeking at the first lyric.
    if geci:
        print(geci[0])
    print(jianjie)
    return geshou, geming, zhuanji, liupai, shijian, jianjie, geci, imgurl
Ejemplo n.º 4
0
 def __init__(self, indexDir,
              computeLengthNorm=True):
     """Open an existing index and configure a MoreLikeThis query builder.

     Args:
         indexDir: Index directory name, resolved under INDEX_PATH.
         computeLengthNorm: When False, install a CustomSimilarity on the
             searcher so document-length normalization is not applied
             (True means the custom similarity is NOT set on the searcher).
     """
     # Attach this thread to the already-running JVM.
     lucene.getVMEnv().attachCurrentThread()
     # StandardAnalyzer: splits English text on delimiters.
     self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     self.path = os.path.join(INDEX_PATH, indexDir)
     self.store = SimpleFSDirectory(File(self.path))
     self.reader = IndexReader.open(self.store)
     self.numDocs = self.reader.maxDoc()
     self.searcher = IndexSearcher(self.reader)
     # Single instance shared by the searcher (optionally) and MoreLikeThis;
     # the original created a second identical copy for no reason.
     sim = CustomSimilarity()
     if not computeLengthNorm:
         self.searcher.setSimilarity(sim)
     self.mlt = MoreLikeThis(self.reader, sim)
     self.mlt.setAnalyzer(self.analyzer)
     # Accept rare terms and short words so nothing is filtered out.
     self.mlt.setMinTermFreq(1)
     self.mlt.setMinDocFreq(1)
     self.mlt.setMinWordLen(1)
     self.mlt.setMaxNumTokensParsed(100000000)
     # Raise the clause limit so very long queries don't fail.
     BooleanQuery.setMaxClauseCount(1024 * 1024)
Ejemplo n.º 5
0
 def __init__(self, lucene_vm, index_dir):
     """Bind to the given Lucene JVM and prepare search components.

     Args:
         lucene_vm: An already-initialized Lucene VM (from lucene.initVM()).
         index_dir: Path of the directory holding (or to hold) the index.
     """
     lucene_vm.attachCurrentThread()
     # Raise the clause limit to avoid 'too many boolean clauses' errors.
     BooleanQuery.setMaxClauseCount(2**16)

     self.index_dir = index_dir

     self.directory = SimpleFSDirectory(Paths.get(self.index_dir))
     self.analyzer = WhitespaceAnalyzer()
     self.parser = QueryParser('content', self.analyzer)

     # Writer/reader/searcher are created lazily by later operations.
     self.writer = None
     self.reader = None
     self.searcher = None
class DocSimilarity(object):
    """Computes pairwise movie similarity scores from Lucene indexes.

    Each movie's document text is used as a query against the matching
    index; hit scores are stored on the Similarities model — once for the
    synopsis index and once for the storyline index. The two public methods
    delegate to a shared helper since they only differ in paths and the
    model attribute they fill.
    """

    def __init__(self):
        # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.boolean_query = BooleanQuery()
        self.similarityOfSynopsis()
        self.similarityOfStoryLine()

    def _score_documents(self, index_path, docs_root, attr_name):
        """Query every document under docs_root against index_path and
        save each hit's score on the Similarities attribute attr_name."""
        directory = SimpleFSDirectory(File(index_path))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS,
                                  analyzer)
        for root, dirnames, filenames in os.walk(docs_root):
            # Filenames are numeric movie ids; sort them numerically.
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    # Very long documents can exceed the clause limit:
                    # keep doubling it until the query parses.
                    while True:
                        try:
                            query = queryParser.parse(
                                QueryParser.escape(content))
                        except Exception:
                            self.boolean_query.setMaxClauseCount(
                                self.boolean_query.maxClauseCount * 2)
                            print(self.boolean_query.maxClauseCount)
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    for scoreDoc in topDocs.scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        # Score each unordered pair only once.
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            # The pair may be stored in either order.
                            similarity = models.Similarities.objects.filter(
                                first_movie=major_movie,
                                second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(
                                    first_movie=minor_movie,
                                    second_movie=major_movie).first()
                            setattr(similarity, attr_name, scoreDoc.score)
                            similarity.save()
                        except Exception as e:
                            print(major_movie.id, minor_movie.id)
                            raise e
                print(u"{0} completed.".format(major_movie.id))

    def similarityOfSynopsis(self):
        self._score_documents(settings.SYNOPSIS_INDEX, settings.SYNOPSIS,
                              'synopsis')

    def similarityOfStoryLine(self):
        self._score_documents(settings.STORYLINE_INDEX, settings.STORYLINE,
                              'storyline')
Ejemplo n.º 7
0
    def __recs_query(self, positive_rated_document_list, scores, recs_number,
                     items_directory, candidate_list: List) -> pd.DataFrame:
        """
        Builds a query using the contents that the user liked. The terms relative to the contents that
        the user liked are boosted by the rating he/she gave. A filter clause is added to the query to
        consider only candidate items
        Args:
            positive_rated_document_list: List of contents that the user liked
            scores: Ratings given by the user
            recs_number: How many items must be recommended. You can only specify the number, not
            a specific item for which compute the prediction
            items_directory: Directory where the items are stored

        Returns:
            score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
        """
        # Lift the default 1024-clause limit: concatenated user profiles
        # produce very large boolean queries.
        BooleanQuery.setMaxClauseCount(2000000)
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(
                Paths.get(items_directory))))
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        # Seed the user profile from the FIRST liked document's fields;
        # one QueryParser per field, all sharing one analyzer.
        field_list = searcher.doc(positive_rated_document_list[0]).getFields()
        user_fields = {}
        field_parsers = {}
        analyzer = SimpleAnalyzer()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] = field.stringValue()
            field_parsers[field.name()] = QueryParser(field.name(), analyzer)

        # First document has been consumed into the profile above.
        positive_rated_document_list.remove(positive_rated_document_list[0])

        # NOTE(review): field_list still holds the FIRST document's fields
        # here, so this appends doc 0's text once per remaining liked doc
        # (without a separator) — presumably each doc's own fields were
        # intended; confirm before changing.
        for _ in positive_rated_document_list:
            for field in field_list:
                if field.name() == 'content_id':
                    continue
                user_fields[field.name()] += field.stringValue()

        logger.info("Building query")

        # NOTE(review): each rating in `scores` re-adds a boosted copy of
        # EVERY field query (SHOULD clauses), rather than pairing one score
        # with one document — verify this weighting is intentional.
        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_name in user_fields.keys():
                if field_name == 'content_id':
                    continue
                field_parsers[field_name].setDefaultOperator(
                    QueryParser.Operator.OR)

                field_query = field_parsers[field_name].escape(
                    user_fields[field_name])
                field_query = field_parsers[field_name].parse(field_query)
                # Boost the whole field query by the user's rating.
                field_query = BoostQuery(field_query, score)
                query_builder.add(field_query, BooleanClause.Occur.SHOULD)

        # Restrict hits to the candidate items via a MUST id filter.
        if candidate_list is not None:
            id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                          for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        # Over-fetch so that hits on already-rated docs can be skipped.
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")

        recorded_items = 0
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            # Exclude documents the user already rated.
            if scoreDoc.doc not in positive_rated_document_list:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1

                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])

        return score_frame
class DocSimilarity(object):
    """Computes pairwise movie similarity scores from Lucene indexes.

    Each movie's document text is used as a query against the matching
    index; hit scores are stored on the Similarities model — once for the
    synopsis index (``similarityOfSynopsis``) and once for the storyline
    index (``similarityOfStoryLine``).
    """

    def __init__(self):
        # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.boolean_query = BooleanQuery()
        self.similarityOfSynopsis()
        self.similarityOfStoryLine()

    def similarityOfSynopsis(self):
        """Fill Similarities.synopsis for every movie pair."""
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            # Filenames are numeric movie ids; sort them numerically.
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    # Long documents can exceed the clause limit: keep
                    # doubling it until the query parses.
                    while True:
                        try:
                            query = queryParser.parse(QueryParser.escape(content))
                        except Exception as e:
                            self.boolean_query.setMaxClauseCount(self.boolean_query.maxClauseCount * 2)
                            print(self.boolean_query.maxClauseCount)
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        # Score each unordered pair only once.
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            # The pair may be stored in either order.
                            similarity = models.Similarities.objects.filter(first_movie=major_movie, second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(first_movie=minor_movie, second_movie=major_movie).first()
                            similarity.synopsis = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print(major_movie.id, minor_movie.id)
                            raise e
                print(u"{0} completed.".format(major_movie.id))

    def similarityOfStoryLine(self):
        """Fill Similarities.storyline for every movie pair."""
        directory = SimpleFSDirectory(File(settings.STORYLINE_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.STORYLINE):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    query = queryParser.parse(QueryParser.escape(content))
                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs

                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            similarity = models.Similarities.objects.filter(first_movie=major_movie, second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(first_movie=minor_movie, second_movie=major_movie).first()
                            similarity.storyline = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print(major_movie.id, minor_movie.id)
                            raise e
                print(u"{0} completed.".format(major_movie.id))
Ejemplo n.º 9
0
    def __init__(self, DATA_DIR, vocab, n_threads, max_terms_per_doc,
                 index_name, index_name_term, docs_path, docs_path_term,
                 use_cache):
        """Start the JVM, open (building if needed) the Lucene indexes and
        prepare the searcher, analyzer and worker pool.

        Args:
            DATA_DIR: Root data directory; indexes live under DATA_DIR/data/.
            vocab: Vocabulary object used by callers of this retriever.
            n_threads: Size of the worker thread pool.
            max_terms_per_doc: Cap on terms returned per document.
            index_name / index_name_term: Folder names of the two indexes.
            docs_path / docs_path_term: Source corpora; when equal, a single
                index with terms serves both purposes.
            use_cache: Whether query results may be served from self.cache.
        """
        self.n_threads = n_threads
        # Index folders are created on demand if they do not exist yet.
        self.index_folder = DATA_DIR + '/data/' + index_name + '/'
        self.index_folder_term = DATA_DIR + '/data/' + index_name_term + '/'
        self.local_index_folder = './' + index_name
        self.local_index_folder_term = './' + index_name_term
        self.use_cache = use_cache
        self.docs_path = docs_path
        self.docs_path_term = docs_path_term
        self.max_terms_per_doc = max_terms_per_doc

        # Large heap: the index and in-memory caches are sizable.
        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = vocab

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(self.index_folder):
            print('Creating index at', self.index_folder)
            # A single shared corpus means the main index also stores terms.
            add_terms = self.docs_path == self.docs_path_term
            self.create_index(self.index_folder, self.docs_path, add_terms)

        # Optionally copy the index to a local folder for faster access.
        if self.local_index_folder:
            print('copying index from', self.index_folder, 'to', self.local_index_folder)
            if os.path.exists(self.local_index_folder):
                print('Folder', self.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(self.index_folder, self.local_index_folder)
            self.index_folder = self.local_index_folder

        fsDir = MMapDirectory(Paths.get(self.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        # Separate term index is only needed when the term corpus differs.
        if self.docs_path != self.docs_path_term:
            if not os.path.exists(self.index_folder_term):
                print('Creating index at', self.index_folder_term)
                self.create_index(self.index_folder_term,
                                  self.docs_path_term,
                                  add_terms=True)

            if self.local_index_folder_term:
                print('copying index from', self.index_folder_term, 'to', self.local_index_folder_term)
                if os.path.exists(self.local_index_folder_term):
                    print('Folder', self.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(self.index_folder_term,
                                    self.local_index_folder_term)
                self.index_folder_term = self.local_index_folder_term
            fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=self.n_threads)
        self.cache = {}

        print('Loading Title-ID mapping...')
        self.title_id_map, self.id_title_map = self.get_title_id_map()
Ejemplo n.º 10
0
# Open the prebuilt index and attach a searcher.
fsDir = SimpleFSDirectory(Paths.get('index'))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

# Optionally fall back to classic TF-IDF scoring instead of BM25.
if CLASSIC_SIMILARITY:
    searcher.setSimilarity(ClassicSimilarity())

# One parser per searchable field, all sharing an English analyzer.
analyzer = EnglishAnalyzer()
tags_parser = QueryParser(TAGS_LABEL, analyzer)
genres_parser = QueryParser(GENRES_LABEL, analyzer)
descr_parser = QueryParser(DESCR_LABEL, analyzer)

tags_parser.setDefaultOperator(QueryParser.Operator.OR)
genres_parser.setDefaultOperator(QueryParser.Operator.OR)
descr_parser.setDefaultOperator(QueryParser.Operator.OR)

BooleanQuery.setMaxClauseCount(
    2000000)  # prevents 1024 limit error for very long queries

############################## Build user queries ##########################
ratings = ML1M('../datasets/ml-1m').ratings

movies_descriptions = pd.read_csv('../datasets/movies-descriptions.csv')
movies_tags = pd.read_csv('../datasets/movies-tags.csv')
movies_genres = pd.read_csv('../datasets/movies-genres.csv')

# Distinct user ids present in the ratings frame.
users = set(ratings[['user']].values.flatten())

output_path = OUTPUT_FOLDER + OUTPUT_FILE_NAME + pick_tag() + '.csv'

# write csv doc header
# NOTE(review): `f` is intentionally left open — code past this chunk
# presumably appends recommendation rows and closes it; confirm.
f = open(output_path, 'w')
f.write('user,item\n')