def get_or_query(self, queries):
    """Creates an OR boolean query from multiple Lucene queries."""
    # empty boolean query with Similarity.coord() disabled
    bq = BooleanQuery(False)
    for q in queries:
        bq.add(q, BooleanClause.Occur.SHOULD)
    return bq
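A minimal usage sketch (the searcher and the two term queries here are illustrative, not part of the original snippet):

# Hypothetical usage: OR two term queries together and search.
q1 = TermQuery(Term('title', 'lucene'))
q2 = TermQuery(Term('title', 'solr'))
or_query = self.get_or_query([q1, q2])
hits = searcher.search(or_query, 10).scoreDocs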
Example 2
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command 
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k,v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:',doc.get('title')
            print 'url:',doc.get('url')
            print 'src:',doc.get('src')
Example 3
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command #朱莉与茱莉娅

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))
        
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if k == 'site':
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:',doc.get('title'),
            print 'url:',doc.get('url')
Example 4
    def more_like_this(self, film, count=4):
        """
        Use query by document techniques to find related documents
        :param film: film
        :param count: number of results
        :return: a list of related films
        """
        # Retrieve doc id of the given film
        film_query = TermQuery(Term('id', str(film.film_id)))
        results = self.searcher.search(film_query, 1)
        if results.totalHits != 1:
            return []

        # Use MoreLikeThis for query-by-document.
        # (`reader` was undefined in the original snippet; the searcher's
        # own reader is assumed here)
        mlt = MoreLikeThis(self.searcher.getIndexReader())
        mlt.setFieldNames(["title", "director", "writer", "genre", "cast", "fullplot"])
        mlt.setMinTermFreq(0)
        mlt.setMinDocFreq(0)
        mlt.setAnalyzer(self.analyzer)
        mlt_query = mlt.like(results.scoreDocs[0].doc)

        # Filter the original film
        filtered_query = BooleanQuery()
        filtered_query.add(mlt_query, BooleanClause.Occur.MUST)
        filtered_query.add(film_query, BooleanClause.Occur.MUST_NOT)
        score_docs = self.searcher.search(filtered_query, count).scoreDocs

        return self._retrieve_in_order(score_docs)
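A hedged call sketch (recommender and some_film are assumed names; per the code above, some_film needs a film_id attribute):

# Hypothetical usage: fetch up to four films related to some_film.
related = recommender.more_like_this(some_film, count=4)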
Example 5
def do_mapping(line):
    regex = re.match(r"(?P<netflix_id>[0-9]+),(?P<year>([0-9]+)|NULL),(?P<title>.+)", line)
    if not regex:
        raise ValueError(line)
    netflix_id = int(regex.group("netflix_id"))

    title = QueryParser.escape(regex.group("title"))
    query1 = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(title)

    year = regex.group("year")
    if year == "NULL":
        scoreDocs = searcher.search(query1, 1).scoreDocs
    else:
        year = int(year)

        query2 = NumericRangeQuery.newIntRange("year", year, year, True, True)
        booleanQuery = BooleanQuery()
        booleanQuery.add(query1, BooleanClause.Occur.MUST)
        booleanQuery.add(query2, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(booleanQuery, 1).scoreDocs

    if scoreDocs:
        if scoreDocs[0].score > 1.5:
            doc = searcher.doc(scoreDocs[0].doc)
            doc_id = doc.getField("id").stringValue()
            doc.add(StringField("netflix_id", str(netflix_id), Field.Store.YES))
            writer.updateDocument(Term("id", doc_id), doc)
    def testFlat(self):

        q = BooleanQuery()
        q.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
        self.assertEqual(1, self.search(q))
    def visitSCOPED_CLAUSE(self, node):
        clause = CqlVisitor.visitSCOPED_CLAUSE(self, node)
        if len(clause) == 1:
            return clause[0]
        lhs, operator, rhs = clause
        query = BooleanQuery()
        query.add(lhs, LHS_OCCUR[operator])
        query.add(rhs, RHS_OCCUR[operator])
        return query
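LHS_OCCUR and RHS_OCCUR are not shown in this snippet. A plausible shape, inferred from how the visitor uses them and from the 'cats NOT dogs' test further down (an assumption, not the library's actual definition):

# Assumed operator tables mapping CQL booleans onto Lucene Occur values.
LHS_OCCUR = {
    'AND': BooleanClause.Occur.MUST,
    'OR': BooleanClause.Occur.SHOULD,
    'NOT': BooleanClause.Occur.MUST,
}
RHS_OCCUR = {
    'AND': BooleanClause.Occur.MUST,
    'OR': BooleanClause.Occur.SHOULD,
    'NOT': BooleanClause.Occur.MUST_NOT,
}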
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    Multifield search: a different query string for each field,
    rather than the same words searched on every field.
    :param q_string: query string for the 'text' field
    :param q_class: query string for the 'corpus_name' field
    :param feature_type: list of functions that reduce the hit scores
    :param use_BM25: use BM25 similarity instead of the default
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        Reduce the hit scores with each feature function.
        :param hists: scoreDocs from the search
        """
        def doc_score(hists):
            """
            Yield the score of each hit.
            :param hists: scoreDocs from the search
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur:
    #   MUST   - the clause must match
    #   SHOULD - the clause should match (optional once a MUST clause exists)
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1=1.5, b=0.75 (JCC-wrapped constructors take positional args); todo: tune BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
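A hedged invocation sketch (the query strings are made up; max and sum stand in for whatever feature functions the caller actually uses):

# Hypothetical usage: score one question/class pair, reducing the hit
# scores with two feature functions.
scores = lucene_retrieval_multifield('what causes tides', 'physics',
                                     feature_type=[max, sum],
                                     use_BM25=True)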
Example 9
    def testCollectScoresWithNoResultAndBooleanQueryDoesntFailOnFakeScorerInAggregateScoreCollector(self):
        q = BooleanQuery()
        q.add(luceneQueryFromCql('M=true'), BooleanClause.Occur.SHOULD)
        q.add(luceneQueryFromCql('M=true'), BooleanClause.Occur.SHOULD)
        q = ComposedQuery('coreA', query=q)
        q.start = 0
        q.stop = 0
        q.setRankQuery(core='coreC', query=luceneQueryFromCql('S=true'))
        q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX+'A'), dict(core='coreC', key=KEY_PREFIX+'C'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(q))
        self.assertEquals(4, result.total)
        self.assertEquals([], result.hits)
Example 10
def do_query(property, qstring, limit = 10):
    query = BooleanQuery()
    stream = analyzer.tokenStream(property, StringReader(qstring))
    stream.reset()
    attr = stream.getAttribute(CharTermAttribute)

    while stream.incrementToken():
        term = attr.toString()
        termQuery = TermQuery(Term(property, term))
        query.add(termQuery, Occur.SHOULD)

    hits = searcher.search(query, None, limit).scoreDocs
    return [Document(searcher.doc(hit.doc)) for hit in hits]
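A short usage sketch (assumes the module-level analyzer and searcher that do_query itself relies on have been initialized):

# Hypothetical usage: OR the analyzed terms against the 'title' field.
top5 = do_query('title', 'pride and prejudice', limit=5)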
    def testOutOfOrderDocsScoringSort(self):
        """
        Two Sort criteria to instantiate the multi/single comparators.
        """

        sorts = [Sort(SortField.FIELD_DOC), Sort()]

        tfcOptions = [[False, False, False],
                      [False, False, True],
                      [False, True, False],
                      [False, True, True],
                      [True, False, False],
                      [True, False, True],
                      [True, True, False],
                      [True, True, True]]

        actualTFCClasses = [
            "OutOfOrderOneComparatorNonScoringCollector", 
            "OutOfOrderOneComparatorScoringMaxScoreCollector", 
            "OutOfOrderOneComparatorScoringNoMaxScoreCollector", 
            "OutOfOrderOneComparatorScoringMaxScoreCollector", 
            "OutOfOrderOneComparatorNonScoringCollector", 
            "OutOfOrderOneComparatorScoringMaxScoreCollector", 
            "OutOfOrderOneComparatorScoringNoMaxScoreCollector", 
            "OutOfOrderOneComparatorScoringMaxScoreCollector" 
        ]
    
        bq = BooleanQuery()

        # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2
        # which delegates to BS if there are no mandatory clauses.
        bq.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)

        # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to
        # return the clause instead of BQ.
        bq.setMinimumNumberShouldMatch(1)

        for sort in sorts:
            for tfcOption, actualTFCClass in izip(tfcOptions,
                                                  actualTFCClasses):
                tdc = TopFieldCollector.create(sort, 10, tfcOption[0],
                                               tfcOption[1], tfcOption[2],
                                               False)

                self.assert_(tdc.getClass().getName().endswith("$" + actualTFCClass))
          
                self.full.search(bq, tdc)
          
                tds = tdc.topDocs()
                sds = tds.scoreDocs  
                self.assertEqual(10, len(sds))
Example 12
    def _create_query(self, fields):
        """
        Build query with Term, Phrase and Fuzzy clauses.
        :param fields: dictionary of (field, text) tuples
        :return: query
        """
        query = BooleanQuery()
        for (field, text) in fields:
            if field.startswith("year"):
                start, end = text.split(",")
                numeric_query = NumericRangeQuery.newIntRange(
                    'year', int(start), int(end), True, True)
                query.add(BooleanClause(numeric_query, BooleanClause.Occur.MUST))
            if field == 'title':
                spans = []
                for word in text.lower().split():
                    spans.append(SpanTermQuery(Term(field, word)))
                query.add(BooleanClause(SpanNearQuery(spans, 2, True), BooleanClause.Occur.SHOULD))

        field_names, field_texts = zip(*fields)
        flags = [BooleanClause.Occur.MUST] * len(field_names)

        query_parser_query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT,
            field_texts,
            field_names,
            flags,
            StandardAnalyzer(Version.LUCENE_CURRENT))
        query.add(BooleanClause(query_parser_query, BooleanClause.Occur.MUST))

        fuzzify = lambda s: (s + " ").replace(" ", "~1 ")
        fuzzy_field_texts = map(fuzzify, field_texts)

        fuzzy_query_parser_query = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT,
            fuzzy_field_texts,
            field_names,
            flags,
            StandardAnalyzer(Version.LUCENE_CURRENT))
        query.add(BooleanClause(fuzzy_query_parser_query, BooleanClause.Occur.MUST))

        boostQuery = FunctionQuery(
            LinearFloatFunction(
                PowFloatFunction(
                    DoubleConstValueSource(0.0001),
                    ScaleFloatFunction(IntFieldSource("imdb_votes_boost"), 0.0, 1.0)
                ), -1.0, 1.0))
        query = CustomScoreQuery(query, boostQuery)

        return query
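The boost wrapper above evaluates to 1 - 0.0001^s, where s is imdb_votes_boost rescaled into [0, 1]: heavily-voted films keep nearly their full score while obscure ones are damped toward zero. A hedged caller sketch (field names per the method; values illustrative):

# Hypothetical usage: fields is a sequence of (field, text) tuples.
q = self._create_query([('title', 'the matrix'), ('year', '1998,2000')])
hits = self.searcher.search(q, 10).scoreDocs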
    def testUnqualifiedTermFields(self):
        composer = LuceneQueryComposer(unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)], luceneSettings=LuceneSettings())
        ast = parseCql("value")
        result = composer.compose(ast)
        query = BooleanQuery()
        left = TermQuery(Term("field0", "value"))
        left.setBoost(0.2)
        query.add(left, BooleanClause.Occur.SHOULD)

        right = TermQuery(Term("field1", "value"))
        right.setBoost(2.0)
        query.add(right, BooleanClause.Occur.SHOULD)

        self.assertEquals(type(query), type(result))
        self.assertEquals(repr(query), repr(result))
    def testParenthesisMust(self):

        q3 = BooleanQuery()
        q3.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
        q3.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
        q4 = BooleanQuery()
        q4.add(BooleanClause(self.c1, BooleanClause.Occur.MUST))
        q4.add(BooleanClause(self.c2, BooleanClause.Occur.MUST))
        q2 = BooleanQuery()
        q2.add(q3, BooleanClause.Occur.SHOULD)
        q2.add(q4, BooleanClause.Occur.SHOULD)
        self.assertEqual(1, self.search(q2))
def delete(primary_keys_map,collection_name,todelete,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT

	try:
		tofind_keyvalue_pairs=json.loads(todelete)
	except:
		return 100	
	

	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)

	#setting writer configurations
	try:
		config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer=IndexWriter(direc,config)
		ireader=IndexReader.open(direc)
	except:
		return 105

	# As of now, deleting documents is supported only via indexed (primary) keys.
	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents according to primary keys		
	query=BooleanQuery()
	for key in tofind_primary_keyvalue_pairs.keys():
		temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
		query.add(BooleanClause(temp,BooleanClause.Occur.MUST))

	a=writer.deleteDocuments(query)
	if commit==True:
		writer.commit()
	writer.close()
	return 0
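A hedged usage sketch (collection and key names are illustrative; the status codes come from the function above):

# Hypothetical usage: delete every record whose primary key 'id' is '42'.
status = delete(['id'], 'books', json.dumps({'id': '42'}), commit=True)
# 0 on success; 100 for bad JSON; 105 if the index cannot be opened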
    def testBraces(self):
        self.assertConversion(TermQuery(Term('unqualified', 'cats')), '(cats)')
        innerQuery = BooleanQuery()
        innerQuery.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
        innerQuery.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST)
        outerQuery = BooleanQuery()
        outerQuery.add(innerQuery, BooleanClause.Occur.SHOULD)
        outerQuery.add(TermQuery(Term('unqualified', 'mice')), BooleanClause.Occur.SHOULD)

        self.assertConversion(outerQuery, '(cats AND dogs) OR mice')
    def perform_search(self, searchterm, results_per_page, page):
        # if there is a field in the searchterm
        """if ":" in searchterm:
            # processing a query
            parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)

            query = parser.parse(searchterm)

        else:
            query = BooleanQuery()
            query_title = TermQuery(Term("title", searchterm))
            query_description = TermQuery(Term("description", searchterm))
            query_content = TermQuery(Term("content", searchterm))

            #  BooleanClause.Occur.MUST for AND queries
            query.add(query_title, BooleanClause.Occur.SHOULD)
            query.add(query_description, BooleanClause.Occur.SHOULD)
            query.add(query_content, BooleanClause.Occur.SHOULD)"""

        # create QueryParser for each field to be searched
        parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
        parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
        parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

        # put fields together
        query = BooleanQuery()
        query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        hits = searcher.search(query, results_per_page + (results_per_page * page))
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        # results to return
        results = []
        count = 0

        for scoreDoc in score_docs:

            # skip offset
            if count < results_per_page * page:
                count += 1
                continue
            count += 1


            doc = searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue()) for field in doc.getFields())
            results.append(table)

        return results, duration, count_results
	def rewrite(data_string):
		data=json.loads(data_string)
		toupdate=json.loads(update)
		#primary_key_modified=False

		#delete the appropriate document
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		

		#modify the values
		for key,value in toupdate.items():
			#if the key is absent we either add it to data and update it, or just ignore it (add_field_if_not_exists defaults to True)
			if add_field_if_not_exists==False:
				if key in data.keys():
					data[key]=value
			else:		
				data[key]=value

		#this deletion is intentionally placed here:
		#the update only proceeds if the modified data's primary keys do not already exist
		primary_key_update=False
		for key in toupdate.keys():
			if key in primary_keys_map:
				primary_key_update=True
				break
		if primary_key_update == True:
			query_search=BooleanQuery()
			for key in primary_keys_map:
				temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
				query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
			hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
			if len(hits) > 0:
				return 106			
		writer.deleteDocuments(query)

		#add the newly modified document
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			temp=json.dumps(data)
			data_string=base64.b64encode(snappy.compress(temp))
		else:
			temp=json.dumps(data)
			data_string=base64.b64encode(temp)

		field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
Example 19
def lucene_range_query_parse(query_string):
    '''parse the user's range query string into something pylucene can understand'''
    query = BooleanQuery()
    queries_ = query_string.split(snapconf.RANGE_QUERY_DELIMITER)
    start = None
    end = None
    start_inclusive = True
    end_inclusive = True
    for query_tuple in queries_:
        m=snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        (col,op_,val)=re.split(snapconf.RANGE_QUERY_OPS,query_tuple)
        if not m or not col or col not in snapconf.TABIX_DBS or col not in snapconf.LUCENE_TYPES:
            continue
        op=m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query, exiting\n" % (str(op)))
            sys.exit(-1)
        (ltype,ptype,qtype) = snapconf.LUCENE_TYPES[col]
        rquery = None
        if ptype == str:
            rquery = TermQuery(qtype(col,str(val)))
        else:
            # numeric field: start from an equality range, then widen it
            # according to the operator
            (start,end) = (ptype(val),ptype(val))
            if op == '>=':
                end = None 
            if op == '<=':
                start = None 
            if op == '<':
                start = None
                end_inclusive = False
            if op == '>':
                end = None
                start_inclusive = False
            rquery = qtype(col,start,end,start_inclusive,end_inclusive)
        query.add(rquery,BooleanClause.Occur.MUST)
        #sys.stderr.write("query + fields: %s %s\n" % (query,field))
    return query
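A hedged example; the ',' delimiter and the operator syntax are assumptions about the snapconf module, which is not shown:

# Hypothetical usage: rows with length >= 100 and coverage < 5.
q = lucene_range_query_parse('length>=100,coverage<5')
hits = searcher.search(q, 50).scoreDocs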
Example 20
    def extract_phrase_query(self, q, field, slop=0, boost=5):
        phrases = re.findall(r'"([^"]*)"', q)
        if len(phrases) == 0:
            return None, q

        q = re.sub(r'"([^"]*)"', "", q).strip()  # query without phrases
        if self.verbose:
            print "Detected phrases: ", phrases

        bq = BooleanQuery()
        for phrase in phrases:
            # pq = PhraseQuery()
            # for term in filter(None, phrase.split(' ')):
            #     pq.add(Term(field, term))
            qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
            # parse phrase - this may or may not be desired
            # pq = qparser.parse(field + ':"' + phrase + '"')
            pq = qparser.parse('%s "%s"~%d^%.1f' %
                               (phrase, phrase, slop, boost))
            # phrase queries have high priority
            bq.add(pq, BooleanClause.Occur.MUST)
            # bq.add(pq, BooleanClause.Occur.SHOULD)

        return bq, q
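A hedged call sketch (field name and query are illustrative):

# Hypothetical usage: quoted phrases become a MUST BooleanQuery; the
# remainder of the string is returned for separate parsing.
phrase_q, rest = self.extract_phrase_query(
    'effects of "climate change" on coral reefs', 'contents', slop=1)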
    def visitSEARCH_CLAUSE(self, node):
        # possible children:
        # CQL_QUERY
        # SEARCH_TERM
        # INDEX, RELATION, SEARCH_TERM
        firstChild = node.children[0].name
        results = CqlVisitor.visitSEARCH_CLAUSE(self, node)
        if firstChild == 'SEARCH_TERM':
            (unqualifiedRhs,) = results
            if unqualifiedRhs == '*':
                return MatchAllDocsQuery()
            subQueries = []
            for fieldname, boost in self._unqualifiedTermFields:
                subQuery = self._termOrPhraseQuery(fieldname, unqualifiedRhs)
                if isinstance(subQuery, PhraseQuery) and not self._fieldRegistry.phraseQueryPossible(fieldname):
                    continue
                subQuery.setBoost(boost)
                subQueries.append(subQuery)
            if len(subQueries) == 1:
                query = subQueries[0]
            else:
                query = BooleanQuery()
                for subQuery in subQueries:
                    query.add(subQuery, BooleanClause.Occur.SHOULD)
            return query
        elif firstChild == 'INDEX':
            (left, (relation, boost), right) = results
            if relation in ['==', 'exact'] or (relation == '=' and self._fieldRegistry.isUntokenized(left)):
                query = TermQuery(self._createTerm(left, right))
            elif relation == '=':
                query = self._termOrPhraseQuery(left, right)
            elif relation in ['<','<=','>=','>']:
                query = self._termRangeQuery(left, relation, right)
            else:
                raise UnsupportedCQL("'%s' not supported for the field '%s'" % (relation, left))

            query.setBoost(boost)
            return query
        else:
            ((query,),) = results
            return query
Example 22
def run(searcher_good, searcher_bad, analyzer):
    while True:
        # read the query from stdin, as in the other run() examples
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        command_dict = parseCommand(command)
        total_num = 20

        # These alternative SortFields choose the ordering: by price
        # (ascending), popularity (total comment count), positive-review
        # rate, or overall score.
        #s=SortField("price",SortField.Type.FLOAT,False)
        #s=SortField("total_comment",SortField.Type.FLOAT,True)
        s = SortField("good_rate", SortField.Type.FLOAT, True)
        #s=SortField("socre",SortField.Type.FLOAT,True)  # sic: index field name
        so = Sort(s)

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        # These two (commented) lines would restrict the price range:
        #q=NumericRangeQuery.newFloatRange("price",100.0,200.0,True,True)
        #querys.add(q,BooleanClause.Occur.MUST)

        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num,
                                                so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        print "%s total matching documents." % total

        #"url"是网址,“img_url”是图片网址,“brand”是品牌
        for scoreDoc_good in scoreDocs_good:
            doc = searcher_good.doc(scoreDoc_good.doc)
            ##            explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'title:', doc.get('title')
            print 'total_comment', doc.get("total_comment")
            print 'price', doc.get("price")
            print 'socre', doc.get("socre")
            print 'brand', doc.get("brand")
            print 'good_rate', doc.get("good_rate")
            print
        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                ##                explanation = searcher.explain(query, scoreDoc.doc)
                print "------------------------"
                print 'title:', doc.get('title')
                print 'total_comment', doc.get("total_comment")
                print 'price', doc.get("price")
                print 'score', doc.get("score")
                print 'brand', doc.get("brand")
                print 'good_rate', doc.get("good_rate")
                print
                if t > total_num - 1 - len(scoreDocs_good):
                    break
Example 23
    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
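A short sketch of the intent (base_query is illustrative):

# Hypothetical usage: restrict any query to documents whose 'duplicate'
# field is 'false'.
base_query = TermQuery(Term('title', 'lucene'))
deduped = self.addDuplicatesQuery(base_query)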
def  search(primary_keys_map,to_be_compressed_input,collection_name,tofind,MAX_RESULTS=1000):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT
	try:
		print "********" + tofind
		tofind_keyvalue_pairs=json.loads(tofind)
	except:
		return 100	
	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
	except:
		return 105

	#initializing return list 
	return_list=[]
	#check_list=[]
	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents		
	if len(tofind_primary_keyvalue_pairs)>0:		
		query=BooleanQuery()
		for key in tofind_primary_keyvalue_pairs.keys():
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		for hit in hits:
			doc=searcher.doc(hit.doc)
			if to_be_compressed_input==True:
				data=snappy.uncompress(doc.get("$DATA$"))
			else:
				data=doc.get("$DATA$")
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					return_list.append(data)
			else:
				return_list.append(data)
			
	else:
		for i in range(0,ireader.numDocs()):
			doc=searcher.doc(i)
			if to_be_compressed_input==True:
				data=snappy.uncompress(str(doc.get("$DATA$")))
			else:
				data=doc.get("$DATA$")

				
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					return_list.append(data)
			else:
				return_list.append(data)
			
	ireader.close()

	if len(return_list)==0:
		return None	
	else:
		return return_list 
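A hedged usage sketch (collection and key names are illustrative; behavior per the code above):

# Hypothetical usage: find 'books' records whose primary key 'id' is '42'.
rows = search(['id'], False, 'books', json.dumps({'id': '42'}))
# returns a list of JSON strings, or None when nothing matches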
Example 25
    def ancientSearch(self, field):
        sear = self._search
        fieldOnly = False
        # field-only search: no query words were given
        if len(self._commandInfo.getWordList()) == 0:
            fieldOnly = True
            bq = BooleanQuery.Builder()
            fields = self._commandInfo.getFields()
            for key in fields:
                queryx = QueryParser(key, KeywordAnalyzer()).parse(fields[key][0])
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()

        elif len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
            bq = BooleanQuery.Builder()
            q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
            bc = BooleanClause(q, BooleanClause.Occur.MUST)
            bq.add(bc)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        elif self._commandInfo.getKey()[0] == '#':
            bq = BooleanQuery.Builder()
            query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
            query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
            bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
            bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
            bq.add(bc1).add(bc2)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        elif self._commandInfo.getKey()[0] in ['$', '+']:
            bq = BooleanQuery.Builder()
            for w in self._commandInfo.getWordList():
                queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        else:
            query = ''
        hits = sear.search(query, 9999)
        for hit in hits.scoreDocs:
            doc = sear.doc(hit.doc)
            res = doc.get(field)
            id = doc.get('id')
            detail = get_detail(doc)
            zhujie = detail['zhujie']
            if detail['detail'] and 'detail' in detail['detail'].keys():
                detail['detail'] = detail['detail']['detail']
            detail.pop('zhujie')
            detail.pop('text')
            detail.pop('type')
            detail = json.dumps(detail)
            if fieldOnly:
                if not doc.get("text").strip():
                    continue
                if id.count(".") == 2:
                    self._doc[id] = doc.get("text")
                    self._resultSentencesList.append((id, doc.get("text")))
                elif id.count(".") == 1:
                    searcher = self._search
                    query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1')
                    hits = searcher.search(query, 1)

                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        res = doc.get("text")
                        if res:
                            self._doc[id+".1"] = doc.get('text')
                            self._resultSentencesList.append((id + ".1", doc.get('text')))
                else:
                    searcher = self._search
                    query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1.1')
                    hits = searcher.search(query, 1)
                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        res = doc.get("text")
                        if not doc.get("text").strip():
                            continue
                        if res:
                            self._doc[id+".1.1"] = doc.get('text')
                            self._resultSentencesList.append((id + ".1.1", doc.get('text')))
            elif doc_hit(res, self._commandInfo):
                if key_filter(self._commandInfo, res):
                    if 'section' in self._commandInfo.getFields().keys():
                        if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['section'], 0):
                            continue
                    if 'document' in self._commandInfo.getFields().keys():
                        if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['document'], 1):
                            continue
                    self._doc[id] = res
                    self._resultSentencesList.append((id, res, detail, zhujie))
        return self
def update(primary_keys_map,to_be_compressed_input,collection_name,tofind,update,commit=False,add_field_if_not_exists=True):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	#As of now, update is implemented as: search, modify the JSON data, delete, and re-write
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT
	try:
		tofind_keyvalue_pairs=json.loads(tofind)
	except:
		return 100	
	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
		#setting writer configurations
		config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer=IndexWriter(direc,config)
	except:
		return 105
	no_of_documents_modified=0	
	#finding the document to update
	#Scope for making this more efficient
	def rewrite(data_string):
		data=json.loads(data_string)
		toupdate=json.loads(update)
		#primary_key_modified=False

		#delete the appropriate document
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		

		#modify the values
		for key,value in toupdate.items():
			#if the key is absent we either add it to data and update it, or just ignore it (add_field_if_not_exists defaults to True)
			if add_field_if_not_exists==False:
				if key in data.keys():
					data[key]=value
			else:		
				data[key]=value

		#this deletion is intentionally placed here:
		#the update only proceeds if the modified data's primary keys do not already exist
		query_search=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
		if len(hits) > 0:
			return 106			
		writer.deleteDocuments(query)

		#add the newly modified document
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			data_string=snappy.compress(str(json.dumps(data)))
		else:
			data_string=json.dumps(data)	
		field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)

	tofind_primary_keyvalue_pairs={}
	tofind_nonprimary_keyvalue_pairs={}

	#separating out primary and non_primary keys
	for key in tofind_keyvalue_pairs.keys():
		if key in primary_keys_map:
			tofind_primary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]
		else:
			tofind_nonprimary_keyvalue_pairs[key]=tofind_keyvalue_pairs[key]

	#filtering documents		
	if len(tofind_primary_keyvalue_pairs)>0:		
		query=BooleanQuery()
		for key in tofind_primary_keyvalue_pairs.keys():
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(tofind_primary_keyvalue_pairs[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		
		for hit in hits:
			doc=searcher.doc(hit.doc)
			if to_be_compressed_input==True:
				data=snappy.uncompress(doc.get("$DATA$"))
			else:
				data=doc.get("$DATA$")
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					if rewrite(data)!=106:
						no_of_documents_modified+=1
					else:
						writer.rollback()
						return 106	
			else:
				if rewrite(data)!=106:
					no_of_documents_modified+=1
				else:
					writer.rollback()
					return 106
				
			
	else:
		for i in range(0,ireader.numDocs()):
			doc=searcher.doc(i)
			if to_be_compressed_input==True:
				data=snappy.uncompress(doc.get("$DATA$"))
			else:
				data=doc.get("$DATA$")
			#non primary key filtering(without having to load all the primary key filtered values into main memory!)	
			if len(tofind_nonprimary_keyvalue_pairs)>0:
				entry=json.loads(data)
				satisfied=True
				for key in tofind_nonprimary_keyvalue_pairs.keys():
					if entry.get(key)!=tofind_nonprimary_keyvalue_pairs[key]:
						satisfied=False
						break
				if satisfied==True:
					if rewrite(data)!=106:
						no_of_documents_modified+=1
					else:
						writer.rollback()
						return 106
			else:
				if rewrite(data)!=106:
					no_of_documents_modified+=1
				else:
					writer.rollback()
					return 106
			
	
	ireader.close()
	if commit==True:
			writer.commit()
	writer.close()
	return str(no_of_documents_modified)+" have been modified"
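A hedged usage sketch (names illustrative; status codes per the code above):

# Hypothetical usage: set 'price' on every 'books' record with id '42'.
msg = update(['id'], False, 'books',
             json.dumps({'id': '42'}),
             json.dumps({'price': '9.99'}),
             commit=True)
# returns "<n> have been modified", or 106 on a primary-key collision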
def store(primary_keys_map,to_be_compressed_input,collection_name,data,commit=False):
	INDEX_DIR_DEFAULT="IndexFiles.index"
	if collection_name!="DEFAULT":
		INDEX_DIR=collection_name
	else:
		INDEX_DIR=INDEX_DIR_DEFAULT	
	print "started indexing input data......"
	
	#extracting values
	try:
		contents=json.loads(data)
	except:
		return 100


	direc=SimpleFSDirectory(File(INDEX_DIR))
	analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)
	

	#checking for existance of record with same primary_key set
	try:
		ireader=IndexReader.open(direc)	
		searcher=IndexSearcher(ireader)
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(contents[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		hits=searcher.search(query,MAX_RESULTS).scoreDocs
		if len(hits) > 0:
			return 106
	except:
		pass 	 
	
	


	#setting writer configurations
	config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
	config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
	writer=IndexWriter(direc,config)
	#fix this later.....FieldType not defined
	#field_type=FieldType()
	#field_type.setIndexed(True)
	#field_type.setStored(False)
	#field_type.setTokenized(False)
	
	try:
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,contents[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			data=snappy.compress(data)
		field=Field("$DATA$",data,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
		if commit==True:
			writer.commit()
		writer.close()
		return 0
	except:
		return 102
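A hedged usage sketch (record contents illustrative; return codes per the function above):

# Hypothetical usage: insert one record keyed on 'id'.
status = store(['id'], False, 'books',
               json.dumps({'id': '42', 'title': 'Lucene in Action'}),
               commit=True)
# 0 on success; 101 if a primary key is missing; 106 on duplicate key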
Example 28
def runstext(command, cpage, meth):

    global vm_env, searcher, analyzer
    text = []
    print(command)
    if command == '':
        return

    command_dict = parseCommand(command)
    querys = BooleanQuery()

    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 1000).scoreDocs
    maxnum = len(scoreDocs)

    keywords = QueryParser(Version.LUCENE_CURRENT, "contents",
                           analyzer).parse(command_dict['contents'])
    reslist = []
    maxnum = min(maxnum, 100)
    for i, scoreDoc in enumerate(scoreDocs[:maxnum]):
        doc = searcher.doc(scoreDoc.doc)
        date = doc.get("date")
        score = float(scoreDoc.score)
        reslist.append([doc, date, score])

    style = highlight.SimpleHTMLFormatter("<b><font color=\'red\'>",
                                          "</font></b>")
    high_seg = highlight.Highlighter(style, highlight.QueryScorer(keywords))
    high_seg.setTextFragmenter(highlight.SimpleFragmenter(50))

    if meth == "rel":
        reslist = sorted(reslist, key=lambda res: res[2], reverse=True)
    elif meth == "td":
        reslist = sorted(reslist, key=lambda res: res[1], reverse=True)
    elif meth == "tu":
        reslist = sorted(reslist, key=lambda res: res[1], reverse=False)
    print keywords
    start = (cpage - 1) * 10
    end = min(start + 10, maxnum)
    print start, end
    for i in reslist[start:end]:
        doc = i[0]
        score = i[2]
        date = str(getdate(i[1]))
        text_dic = {}
        text_dic['title'] = doc.get("title").strip('-直播吧zhibo8.cc').strip(
            '_新浪竞技风暴_新浪网')
        text_dic['url'] = doc.get("url")
        tmpcontent = cleantxt(doc.get("contents"))
        keyword = high_seg.getBestFragment(analyzer, "contents", tmpcontent)
        text_dic['keyword'] = keyword
        text_dic['score'] = score
        text_dic['date'] = date
        text.append(text_dic)
    '''for i, scoreDoc in enumerate(scoreDocs):
        text_dic = {}
        doc = searcher.doc(scoreDoc.doc)

        text_dic['title'] = doc.get("title")
        text_dic['url'] = doc.get("url")
        keyword = high_seg.getBestFragment(analyzer, "contents", cleantxt(doc.get('contents')))
        text_dic['keyword'] = keyword
        text.append(text_dic)'''

    return text, maxnum
    def testBooleanNotTermOutput(self):
        query = BooleanQuery()
        query.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
        query.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST_NOT)
        self.assertConversion(query, 'cats NOT dogs')
print("Index contains %d documents." % n_docs)

def get_query_results(reader,query,n,field):
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        print("%d. %s" % (i + 1, doc.get(field)))

#### part(a)
query1a = TermQuery(Term("capital_html","greek"))
query2a = TermQuery(Term("capital_html","roman"))
query3a = TermQuery(Term("capital_html","persian"))

boolean_query_a = BooleanQuery()
boolean_query_a.add(query1a, BooleanClause.Occur.MUST)
boolean_query_a.add(query2a, BooleanClause.Occur.MUST)
boolean_query_a.add(query3a, BooleanClause.Occur.MUST_NOT)

get_query_results(reader,boolean_query_a,n_docs,"capital")

#Found 32 hits:
#1. https://en.wikipedia.org/wiki/Sukhumi
#2. https://en.wikipedia.org/wiki/Nicosia
#3. https://en.wikipedia.org/wiki/Nicosia
#4. https://en.wikipedia.org/wiki/Tiraspol
#5. https://en.wikipedia.org/wiki/Tripoli
#6. https://en.wikipedia.org/wiki/Tunis
#7. https://en.wikipedia.org/wiki/Lisbon
#8. https://en.wikipedia.org/wiki/Podgorica
    def testEquality(self):

        bq1 = BooleanQuery()
        bq1.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        bq1.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)

        nested1 = BooleanQuery()
        nested1.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
        nested1.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
        bq1.add(nested1, BooleanClause.Occur.SHOULD)

        bq2 = BooleanQuery()
        bq2.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        bq2.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)

        nested2 = BooleanQuery()
        nested2.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
        nested2.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
        bq2.add(nested2, BooleanClause.Occur.SHOULD)
        
        self.assert_(bq1.equals(bq2))
class DocSimilarity(object):
    def __init__(self):
        # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.boolean_query = BooleanQuery()
        self.similarityOfSynopsis()
        self.similarityOfStoryLine()

    def similarityOfSynopsis(self):
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader  = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    while True:
                        try:
                            query = queryParser.parse(QueryParser.escape(content))
                        except Exception as e:
                            self.boolean_query.setMaxClauseCount(self.boolean_query.maxClauseCount * 2)
                            print self.boolean_query.maxClauseCount
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            similarity = models.Similarities.objects.filter(first_movie=major_movie, second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(first_movie=minor_movie, second_movie=major_movie).first()
                            similarity.synopsis = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print major_movie.id, minor_movie.id
                            raise e
                print u"{0} completed.".format(major_movie.id)

    def similarityOfStoryLine(self):
        directory = SimpleFSDirectory(File(settings.STORYLINE_INDEX))
        ireader  = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.STORYLINE):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    query = queryParser.parse(QueryParser.escape(content))
                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs

                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            similarity = models.Similarities.objects.filter(first_movie=major_movie, second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(first_movie=minor_movie, second_movie=major_movie).first()
                            similarity.storyline = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print major_movie.id, minor_movie.id
                            raise e
                print u"{0} completed.".format(major_movie.id)