def index_single_file(self, doc_file):
    """Index every suitable wiki article found in ``doc_file``.

    The file holds one JSON object per line with 'title', 'plaintext' and
    '_id' keys. Articles with fewer than three paragraphs are skipped.
    Returns the number of Lucene documents added.
    """
    logger.info("adding {}".format(doc_file))
    added = 0
    try:
        with open(doc_file) as handle:
            for raw_line in handle:
                article = json.loads(raw_line)
                body = article['plaintext']
                # Skip very short articles (fewer than 3 paragraphs).
                if len(body.split('\n\n')) < 3:
                    continue
                cleaned = rm_special_chars(body)
                doc = Document()
                doc.add(StringField("id", str(article['_id']), Field.Store.YES))
                doc.add(TextField("title", article['title'], Field.Store.YES))
                doc.add(TextField("text", cleaned, Field.Store.YES))
                self.writer.addDocument(doc)
                added += 1
                if added % 10000 == 0:
                    logger.info('added {} lucene docs'.format(added))
    except Exception as e:
        import traceback
        traceback.print_tb(e.__traceback__)
        logger.error("Failed in: {}".format(doc_file))
    return added
def get_document(self, title, body, tags, date):
    """Assemble a Document whose title/body/tags/date are all stored,
    tokenized TextFields (field names come from the class constants)."""
    doc = Document()
    for field_name, value in ((self.INDEX_FILED_TITLE, title),
                              (self.INDEX_FILED_BODY, body),
                              (self.INDEX_FILED_TAGS, tags),
                              (self.INDEX_FILED_DATE, date)):
        doc.add(TextField(field_name, value, Field.Store.YES))
    return doc
def indexDocs(self, root, writer):
    """Walk every sub-directory of ``root`` and index each crawled page file.

    File format: first line = URL, second line = title, remainder = body.
    Body and URL are segmented with jieba so Chinese text is searchable.
    Fix: the file handle is now closed via ``with`` (it previously leaked
    on any exception during reading).
    """
    for top, dirnames, filenames in os.walk(root):
        for dirname in dirnames:  # iterate over the sub-directories
            path1 = os.path.join(top, dirname)
            for _, _, filenames in os.walk(path1):  # files inside this sub-directory
                for filename in filenames:
                    print("adding", filename)
                    path = os.path.join(path1, filename)
                    with open(path, encoding='utf8') as file:
                        page = file.readline()
                        title = file.readline()
                        contents = file.read()
                    # jieba segmentation of the body text
                    seg_contents = jieba.lcut_for_search(contents)
                    contents = ' '.join(seg_contents)
                    url = page
                    # Segment the URL and drop boilerplate tokens so only
                    # meaningful site words remain for the "site" field.
                    seg_url = jieba.lcut_for_search(page)
                    page = ' '.join(list(
                        set(seg_url)
                        - set(['.', 'http', 'https', '/', ':', '?', '=', 'html', 'shtml', 'www'])))
                    doc = Document()
                    doc.add(StringField("name", filename, Field.Store.YES))
                    doc.add(StringField("path", path, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(TextField('title', title, Field.Store.YES))
                        doc.add(TextField('site', page, Field.Store.YES))
                        doc.add(TextField('url', url, Field.Store.YES))
                        doc.add(TextField('contents', contents, Field.Store.YES))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
def index_one(self, article):
    """Create index for one url object in the database.

    Bails out (with a warning) when ``date_published`` cannot be formatted.
    """
    try:
        date_published_str = article['date_published'].strftime(self.date_format)
    except Exception as e:
        logger.warning('Error when formating date_published %r: %s ',
                       article['canonical_url'], e)
        return
    doc = Document()
    # Numeric ids: stored only, not searchable.
    doc.add(StoredField('group_id', article['group_id']))
    doc.add(StoredField('article_id', article['article_id']))
    # date_published appears three ways: exact-match, sortable doc-value, stored.
    doc.add(StringField('date_published', date_published_str, Field.Store.YES))
    doc.add(SortedDocValuesField('date_published', BytesRef(date_published_str)))
    doc.add(StoredField('date_published', date_published_str))
    # Exact-match metadata fields.
    doc.add(StringField('domain', article['domain'], Field.Store.YES))
    doc.add(StringField('site_type', article['site_type'], Field.Store.YES))
    # Tokenized text fields; meta/content are searchable but not stored.
    doc.add(TextField('canonical_url', article['canonical_url'], Field.Store.YES))
    doc.add(TextField('title', article['title'], Field.Store.YES))
    doc.add(TextField('meta', article['meta'], Field.Store.NO))
    doc.add(TextField('content', article['content'], Field.Store.NO))
    doc.add(StoredField('uq_id_str', article['uq_id_str']))
    self.writer.addDocument(doc)
def getDoc(self, file):
    """Parse an HTML file into a Document with contents/filename/filepath fields.

    Returns None when the file cannot be read or parsed.
    Fixes: the file handle is now closed via ``with`` (previously leaked), and
    the outer handler reports the real exception type — the original printed
    ``type(Exception).__name__``, which is always the literal string 'type'.
    """
    full_path = os.getcwd() + FILE_DIR + '/' + file
    try:
        with open(full_path, "r") as f:
            try:
                soup = BeautifulSoup(f, 'html.parser')
                text = soup.findAll(text=True)
                c = filter(tag_vis, text)
                try:
                    c = ' '.join(c)
                except Exception:
                    # Fallback when the fragments are byte strings.
                    c = b' '.join(c)
            except Exception as e:
                print(str(e))
                return
        content = TextField("contents", c, Field.Store.YES)
        fileName = str(Paths.get(file)).split('/')[-1]
        fileName = fileName[:fileName.find(".")]  # strip the extension
        filename = TextField("filename", fileName, Field.Store.YES)
        path = TextField("filepath", str(full_path), Field.Store.NO)
        doc = Document()
        doc.add(content)
        doc.add(filename)
        doc.add(path)
        return doc
    except Exception as e:
        print(type(e).__name__)  # bug fix: was type(Exception).__name__
        print(str(e))
        return
def get_doc(self, filename, path, title, url, contents):
    '''
    Generate a `Document` according to the parameters.
    Input: `filename`: filename of the webpage
           `path`: path of the webpage
           `title`: title of the webpage
           `url`: original url of the webpage
           `contents`: contents of the webpage
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    # Exact-match fields for identification.
    doc.add(StringField("name", filename, Field.Store.YES))
    doc.add(StringField("path", path, Field.Store.YES))
    # Tokenized fields for searching.
    doc.add(TextField("title", title, Field.Store.YES))
    doc.add(TextField("url", url, Field.Store.YES))
    if contents:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(filename))
    return doc
def index (cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer from org.apache.lucene.util import Version config = IndexWriterConfig(Version.LUCENE_42, WhitespaceAnalyzer(Version.LUCENE_42)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # FacetFields is a utility class for adding facet fields to a document: facet_fields = FacetFields(taxo) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [CategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List: facetList = Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # use the FacetFields utility class for adding facet fields (i.e. the categories) # to the document (and, as required, to the taxonomy index) facet_fields.addFields(doc, facetList) # finally add the document to the index iw.addDocument(doc) nDocsAdded +=1 nFacetsAdded += facetList.size() # end for # commit changes. 
# we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
def index(self, eid, name, cname, type, info):
    """Index one entity record; every attribute becomes a stored,
    tokenized TextField."""
    doc = Document()
    for field_name, value in (('id', eid),
                              ('name', name),
                              ('CannonicalName', cname),
                              ('type', type),
                              ('info', info)):
        doc.add(TextField(field_name, value, Field.Store.YES))
    self.writer.addDocument(doc)
def create_document_by_document_sentence(org_title, preprocessed_title, doc_id, sentence):
    """Build a Document for one sentence of an article.

    Original title and doc id are exact-match (StringField); the
    preprocessed title and the sentence are tokenized (TextField).
    """
    doc = Document()
    for ctor, field_name, value in ((StringField, "org_title", org_title),
                                    (TextField, "preprocessed_title", preprocessed_title),
                                    (StringField, "doc_id", str(doc_id)),
                                    (TextField, "sentence", sentence)):
        doc.add(ctor(field_name, value, Field.Store.YES))
    return doc
def index_files(): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() BASE_DIR = path.dirname(path.abspath(sys.argv[0])) INPUT_DIR = BASE_DIR + "/input/" INDEX_DIR = BASE_DIR + "/lucene_index/" NoT = 100000 # Number of Tokens print "------------------------------------------------------" print "PyLucene Demo started (lucene_demo.py)" print "Python version: %d.%d.%d" % ( sys.version_info.major, sys.version_info.minor, sys.version_info.micro) print 'Lucene version:', lucene.VERSION print "------------------------------------------------------\n" # lucene.initVM() # directory = RAMDirectory() index_path = Paths.get(INDEX_DIR) directory = SimpleFSDirectory(index_path) analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, NoT) config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, config) print "Number of indexed documents: %d\n" % writer.numDocs() for input_file in listdir(INPUT_DIR): # iterate over all input files print "Current file:", input_file if input_file.endswith(".json"): with open(INPUT_DIR + input_file) as f: for line in f: # doc = create_document(line, input_file) # call the create_document function o = json.loads(line) doc = Document() # create a new document doc.add(TextField("filename", input_file, Field.Store.YES)) # print file doc.add( TextField("username", o['user']['screen_name'], Field.Store.YES)) # print "username: "******"text", o['text'], Field.Store.YES)) # print "text: " + o['text'] if o['user']['location']: doc.add( TextField("location", o['user']['location'], Field.Store.YES)) # print "location: " + o['user']['location'] doc.add(TextField("time", o['created_at'], Field.Store.YES)) writer.addDocument( doc) # add the document to the IndexWriter print "\nNumber of indexed documents: %d" % writer.numDocs() writer.close() print "Finished\n" print "-----------------------------------------------------"
def create_document_by_document_content(org_title, preprocessed_title, preprocessed_title_lower, content):
    """Build a Document for a whole article.

    Title variants are exact-match except the preprocessed title (tokenized);
    the content body is tokenized for full-text search.
    """
    doc = Document()
    for ctor, field_name, value in (
            (StringField, "org_title", org_title),
            (TextField, "preprocessed_title", preprocessed_title),
            (StringField, "preprocessed_title_lower", preprocessed_title_lower),
            (TextField, "content", content)):
        doc.add(ctor(field_name, value, Field.Store.YES))
    return doc
def createDocument_tweet(data):
    """Build a Document holding a tweet's text and its author's screen name."""
    tweet_text = data['text']
    author = data['user']['screen_name']
    # Coordinates are looked up but not indexed; the access is kept so a
    # missing 'coordinates' key still raises KeyError as before.
    _coords = data['coordinates']
    doc = Document()
    doc.add(TextField("tweet", tweet_text, Field.Store.YES))
    doc.add(TextField("u_name", author, Field.Store.YES))
    return doc
def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice):
    # Index a single product record. Chinese good/shop names are jieba-
    # segmented into extra "_s" fields used for search, while the raw values
    # are kept in stored-but-not-indexed fields.
    # NOTE(review): Python 2 only (print statement, unicode(),
    # `except Exception, e`); also uses the pre-5.x IntField/DoubleField API.
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    # (translated) True: build a brand-new index; False: incremental index
    writer = IndexWriter(self.dir, config)
    # Field type for raw values: stored, but neither tokenized nor indexed.
    noIndexedString = FieldType()
    noIndexedString.setTokenized(False)
    noIndexedString.setIndexed(False)
    noIndexedString.setStored(True)
    try:
        print "adding", goodname
        goodname_s = unicode(goodname, 'utf8')
        seg_list_good = jieba.cut(goodname_s, cut_all=False)
        goodname_s = " ".join(seg_list_good)  # (translated) default accurate mode
        shopname_s = unicode(shopname, 'utf8')
        seg_list_shop = jieba.cut(shopname_s, cut_all=False)
        shopname_s = " ".join(seg_list_shop)  # (translated) default accurate mode
        # Raw (display) vs segmented (search) variants of the same values.
        shopnameField = Field("shopName", shopname, noIndexedString)
        shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO)
        goodnameField = Field("goodName", goodname, noIndexedString)
        goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO)
        salenumField = IntField("saleNum", salenum, Field.Store.YES)
        priceField = DoubleField("price", price, Field.Store.YES)
        urlField = Field("url", url, noIndexedString)
        pictureField = StringField("pictureName", picturename, Field.Store.YES)
        commentField = Field("comments", comment, noIndexedString)
        historyPriceField = Field("historyPrice", historyprice, noIndexedString)
        doc = Document()
        doc.add(shopnameField)
        doc.add(shopnameField_s)
        doc.add(goodnameField)
        doc.add(goodnameField_s)
        doc.add(salenumField)
        doc.add(priceField)
        doc.add(urlField)
        doc.add(pictureField)
        doc.add(commentField)
        doc.add(historyPriceField)
        writer.addDocument(doc)
        # NOTE(review): writer is never committed or closed in this method —
        # confirm the caller handles that, otherwise the add may be lost.
    except Exception, e:
        print "Failed in indexDocs:", e
def obj_to_document(obj):
    """Convert a wrapped data object into a Lucene Document.

    Stores obj.index and the class name, then serializes each attribute of
    obj.data together with a type tag (LT_*) written to a parallel
    "<name><LTPF_TYPE>" field, presumably so the document can be converted
    back to an object later — TODO confirm against the reverse routine.
    String-like values also get a jieba-segmented "<name><LTPF_FOR_QUERY>"
    field used for querying.
    NOTE(review): Python 2 only (`unicode`, `long`); relies on the
    module-level LT_* / LTPF_* constants and the `hyper_text` type.
    """
    def conv_to_str(x):
        # unicode -> utf8 bytes; everything else through str()
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            # Placeholder: stored empty string, not indexed.
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                # Int list: unique values space-joined into searchable text.
                res.add(
                    TextField(k, ' '.join((str(x) for x in set(v))),
                              Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, str) or isinstance(v, unicode):
            # Raw value stored unindexed; segmented copy used for querying.
            res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v)), Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            # Store raw markup; query against the plain-text rendering.
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v.text)),
                          Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            # bool must be tested before int (bool is an int subclass).
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        # Record the detected type tag alongside the value.
        res.add(
            Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res
def IndexDocs(self, documents):
    """
    Index documents under the directory

    :Parameters:
    - `documents`: Documents to be indexed (List)
    """
    writer = IndexWriter(self.__indexDir, IndexWriterConfig(self.__analyzer))
    for entry in documents:
        # Build the Lucene document for this entry.
        doc = Document()
        doc.add(TextField(Indexer.NAME, entry['name'], Field.Store.YES))
        doc.add(Field(Indexer.CONTENT, entry['content'], self.__contentType))
        doc.add(StringField(Indexer.DATE, entry['date'], Field.Store.YES))
        doc.add(StringField(Indexer.URL, entry['url'], Field.Store.YES))
        doc.add(TextField(Indexer.TAGS, self.__qualifyTags(entry['tags']),
                          Field.Store.YES))
        doc.add(LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(entry['date'])))
        if self.__boAppend:
            # Existing index: replace any previously indexed copy that
            # matches this exact name.
            if self.__verbose:
                print("Updating " + entry['name'])
            writer.updateDocument(Term(Indexer.NAME, entry['name']), doc)
        else:
            # Fresh index: no old copy can exist, a plain add suffices.
            if self.__verbose:
                print("Adding " + entry['name'])
            writer.addDocument(doc)
    # Report and release the writer.
    print("Indexed %d documents (%d docs in index)" % (len(documents), writer.numDocs()))
    writer.close()
def addDoc(w, data):
    """Build a Document from ``data`` ({field: (value, type_name)}) and add
    it to the index through writer ``w``. Unknown type names are reported
    and the field is skipped."""
    doc = Document()
    for field_name in data:
        field_value, field_kind = data[field_name]
        if field_kind == 'StringField':
            new_field = StringField(field_name, field_value, Field.Store.YES)
        elif field_kind == 'TextField':
            new_field = TextField(field_name, field_value, Field.Store.YES)
        elif field_kind == 'CUSTOM_FIELD_TEXT':
            new_field = Field(field_name, field_value, CUSTOM_FIELD_TEXT)
        elif field_kind == 'CUSTOM_FIELD_TEXT_DF':
            new_field = Field(field_name, field_value, CUSTOM_FIELD_TEXT_DF)
        elif field_kind == 'CUSTOM_FIELD_TEXT_BF':
            new_field = Field(field_name, field_value, CUSTOM_FIELD_TEXT_BF)
        elif field_kind == 'INTEGER_STORED':
            new_field = StoredField(field_name, field_value)
        else:
            print('UNKNOWN FIELD')
            continue
        doc.add(new_field)
    w.addDocument(doc)
def indexDocs(self, root, writer):
    """Index every image reference found in the crawled page files under ``root``.

    File format: first line = URL, second line = title, remainder = contents
    holding img_url / img_info attribute lists (parsed by getTxtAttribute).
    One Lucene document is added per image with a non-empty description.
    Fixes: file handle closed via ``with`` (previously leaked), removed the
    no-op ``title = title`` assignment and the dangling ``else: continue``.
    """
    for top, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding", filename)
            file_path = os.path.join(top, filename)
            with open(file_path, encoding='utf8') as file:
                url = file.readline()
                title = file.readline()
                contents = file.read()
            img_url = self.getTxtAttribute(contents, 'img_url')
            img_info = self.getTxtAttribute(contents, 'img_info')
            for i in range(len(img_url)):
                if len(img_info[i]) > 0:
                    doc = Document()
                    doc.add(StringField('title', title, Field.Store.YES))
                    doc.add(StringField('url', url, Field.Store.YES))
                    doc.add(StringField('img_url', img_url[i], Field.Store.YES))
                    # jieba-segment the image description so Chinese text
                    # becomes searchable.
                    segmented = ' '.join(jieba.lcut_for_search(img_info[i]))
                    doc.add(TextField('contents', segmented, Field.Store.YES))
                    writer.addDocument(doc)
def addDoc(w, data):
    """Build a Document from ``data`` ({field: (value, type_name)}) and add
    it via writer ``w``.

    On indexing failure, every field is dumped for debugging instead of
    crashing. Fix: the bare ``except:`` is narrowed to ``except Exception:``
    so SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    doc = Document()
    for field in data:
        value, ftype = data[field][0], data[field][1]
        if ftype == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif ftype == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif ftype == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif ftype == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif ftype == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    try:
        w.addDocument(doc)
    except Exception:
        # Diagnostic dump of the offending record.
        print('-----------------------------------')
        for field in data:
            value, ftype = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
def indexDocs(self, root, writer):
    """Index the TEXT elements of every XML file under <root>/data/*/*.xml.

    Each TEXT element becomes one document keyed by its sibling DOCNO.
    Returns the number of Lucene documents added.
    """
    num_indexed = 0
    for xml in glob.glob(root + "/data/*/*.xml"):
        # Try strict parsing first; on syntax errors re-parse in recovery mode.
        try:
            tree = etree.parse(
                xml, parser=etree.XMLParser(recover=False, strip_cdata=False))
        except etree.XMLSyntaxError:
            tree = etree.parse(
                xml, parser=etree.XMLParser(recover=True, strip_cdata=False))
        for text in tree.getroot().iter("TEXT"):
            contents = "".join(text.xpath("text()")).strip()
            doc_no = text.getparent().find("DOCNO").text
            try:
                doc = Document()
                doc.add(StringField("id", doc_no, Field.Store.YES))
                if contents:
                    doc.add(TextField("contents", contents, Field.Store.YES))
                writer.addDocument(doc)
                num_indexed += 1
            except Exception as e:
                print("Failed in indexDocs:", e)
    return num_indexed
def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    """Add one person record: tokenized name plus exact-match date/note fields."""
    doc = Document()
    doc.add(TextField("name", name, Field.Store.YES))
    for field_name, value in (("birth_date", birth_date),
                              ("death_date", death_date),
                              ("birth_note", birth_note),
                              ("death_note", death_note)):
        doc.add(StringField(field_name, value, Field.Store.YES))
    w.addDocument(doc)
def create_document(file_name):
    """Build a Document for ./alldocs/<file_name>: exact-match title plus
    tokenized full text.

    Fix: the title field previously used the undefined global ``input_file``
    (a NameError unless it happened to exist elsewhere); it must be the
    ``file_name`` parameter. The file is also closed via ``with`` now.
    """
    path = './alldocs/' + file_name
    doc = Document()
    with open(path) as file:
        doc.add(StringField("title", file_name, Field.Store.YES))
        doc.add(TextField("text", file.read(), Field.Store.YES))
    return doc
def create_doc(data):
    """Build a tweet Document; location and page title are added only when present."""
    doc = Document()
    doc.add(TextField("username", data['screen_name'], Field.Store.YES))
    doc.add(TextField("text", data['tweet'], Field.Store.YES))
    doc.add(TextField("date", data['tweet_date'], Field.Store.YES))
    # Optional fields: skipped when empty/None.
    if data['tweet_location']:
        doc.add(TextField("location", data['tweet_location'], Field.Store.YES))
    if data['page_title']:
        doc.add(TextField("page title", data['page_title'], Field.Store.YES))
    return doc
def index_document(self, wiki_doc):
    """
    :param wiki_doc: the document to be indexed.
    :return:
    """
    # One Lucene document per section; the section id is
    # "<article_id>_<section_index>".
    for position, section in enumerate(wiki_doc.sections):
        doc = Document()
        doc.add(StringField("id_article", wiki_doc.id, Field.Store.YES))
        doc.add(TextField("title_article", wiki_doc.title, Field.Store.YES))
        doc.add(StringField("id_section",
                            str(wiki_doc.id) + "_" + str(position),
                            Field.Store.YES))
        doc.add(TextField("title_section", section.title, Field.Store.YES))
        doc.add(TextField("content_section", section.text, Field.Store.YES))
        self.writer.addDocument(doc)
def addDocument(self, id):
    """Index the answer at position ``id`` of the global answers_train list
    and commit immediately so it is searchable right away."""
    global answers_train
    doc = Document()
    doc.add(TextField("pa", answers_train[id], Field.Store.YES))
    doc.add(StringField("id", str(id), Field.Store.YES))
    self.w.addDocument(doc)
    self.w.commit()
def addDoc(w, doc_name, text, file_name):
    """
    add single doc to the index
    :param w: writer
    :param doc_name:
    :param text:
    :param file_name:
    :return:
    """
    # All three fields are stored, tokenized TextFields.
    doc = Document()
    for field_name, value in (("text", text),
                              ("doc_name", doc_name),
                              ("corpus_name", file_name)):
        doc.add(TextField(field_name, value, Field.Store.YES))
    w.addDocument(doc)
def create_document(file_name):
    """Build a Document for INPUT_DIR/<file_name>: exact-match title plus
    the whole book as tokenized text.

    Fix: the title field previously used the undefined global ``input_file``
    (a NameError unless it happened to exist elsewhere); it must be the
    ``file_name`` parameter. The file is also closed via ``with`` now.
    """
    path = INPUT_DIR + file_name  # assemble the file descriptor
    doc = Document()  # create a new document
    with open(path) as file:
        doc.add(StringField("title", file_name, Field.Store.YES))
        doc.add(TextField("text", file.read(), Field.Store.YES))
    return doc
def get_doc(self, doc_info, contents):
    '''
    Generate a `Document` according to the given info.
    Input: `doc_info`: info of the doc (`name`, `path`, `title`, `url`, `site`)
           `contents`: contents of the webpage
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    # Exact-match metadata fields, in the same order as the source dict keys.
    for key in ('name', 'path', 'title', 'url'):
        doc.add(StringField(key, doc_info[key], Field.Store.YES))
    # Tokenized fields for searching.
    doc.add(TextField("site", doc_info['site'], Field.Store.YES))
    if contents:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(doc_info['name']))
    return doc
def retrival_answer(MAX):
    """Index document.txt lines, run each query.txt query against them, and
    report the fraction of queries whose top-MAX hits fuzzy-match the
    ground-truth document line.

    MAX: number of top hits retrieved per query.
    NOTE(review): Python 2 only (print statements, `unicode`); `process`
    is presumably fuzzywuzzy's extract — confirm the import.
    """
    lucene.initVM()
    directory = RAMDirectory()
    # NOTE(review): indexDir is created but never used afterwards — the index
    # is built in the in-memory `directory`; confirm whether an on-disk
    # index at 'index' was intended.
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."
    # NOTE(review): these file handles are never closed — consider `with`.
    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    # One Lucene document per line of document.txt.
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))
        # Fuzzy-match the retrieved candidates against the ground-truth line;
        # a similarity score >= 89 (scale 0-100) counts as a hit.
        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)
    final_accuracy = float(sum(accuracy)) / float(len(accuracy))
    print "the final accuracy is:", final_accuracy
def get_doc(self, img):
    '''
    Generate a `Document` according to the parameters.
    Input: `img`: dict containing a single image info
    Output: `Document` with the fields initialized
    '''
    document = Document()
    # URL fields are exact-match; only the description is tokenized.
    document.add(StringField("img_url", img['img_url'], Field.Store.YES))
    document.add(TextField("description", img['description'], Field.Store.YES))
    document.add(StringField("url", img['url'], Field.Store.YES))
    document.add(StringField("url_title", img['url_title'], Field.Store.YES))
    return document
def addDoc(w, text):
    """Index ``text`` as a single stored, tokenized "text" field via writer ``w``."""
    document = Document()
    document.add(TextField("text", text, Field.Store.YES))
    w.addDocument(document)