Esempio n. 1
0
 def index_single_file(self, doc_file):
     """Index the wiki documents stored in one JSON-lines file.

     Every line of ``doc_file`` is parsed as a JSON object from which the
     keys ``title``, ``plaintext`` and ``_id`` are read. Documents whose
     plain text splits into fewer than three blank-line-separated
     paragraphs are skipped; the others are cleaned with
     ``rm_special_chars`` and added to ``self.writer``.

     Any exception stops the processing of this file. It is logged with its
     type, message and traceback, and the count reached so far is returned.

     :param doc_file: path of the JSON-lines file to index.
     :return: number of documents added to the writer.
     """
     logger.info("adding {}".format(doc_file))
     lucene_doc_num = 0
     try:
         # Decode explicitly instead of relying on the platform default
         # encoding (assumes the dump is UTF-8, the JSON interchange
         # encoding -- TODO confirm for the actual data files).
         with open(doc_file, encoding="utf-8") as df:
             for line in df:
                 wiki_doc = json.loads(line)
                 doc_title = wiki_doc['title']
                 doc_text = wiki_doc['plaintext']
                 doc_id = wiki_doc['_id']
                 # The paragraph count is taken on the raw text, before
                 # special characters are removed.
                 if len(doc_text.split('\n\n')) < 3:
                     continue
                 doc_text = rm_special_chars(doc_text)
                 doc = Document()
                 doc.add(StringField("id", str(doc_id), Field.Store.YES))
                 doc.add(TextField("title", doc_title, Field.Store.YES))
                 doc.add(TextField("text", doc_text, Field.Store.YES))
                 self.writer.addDocument(doc)
                 lucene_doc_num += 1
                 if lucene_doc_num % 10000 == 0:
                     logger.info('added {} lucene docs'.format(lucene_doc_num))
     except Exception:
         # logger.exception records the exception type and message along
         # with the traceback, so the cause of the failure is not lost.
         logger.exception("Failed in: {}".format(doc_file))
     return lucene_doc_num
Esempio n. 2
0
 def get_document(self, title, body, tags, date):
     """Build a Lucene document from a title, body, tags and date.

     Each value is added as a stored ``TextField`` whose field name comes
     from the matching ``INDEX_FILED_*`` attribute of this object.

     :return: the populated ``Document``.
     """
     named_values = (
         (self.INDEX_FILED_TITLE, title),
         (self.INDEX_FILED_BODY, body),
         (self.INDEX_FILED_TAGS, tags),
         (self.INDEX_FILED_DATE, date),
     )
     document = Document()
     for field_name, value in named_values:
         document.add(TextField(field_name, value, Field.Store.YES))
     return document
Esempio n. 3
0
    def indexDocs(self, root, writer):
        """Index the text files found below the sub-directories of ``root``.

        For every file, the first line is read as the page URL, the second
        line as the page title and the remainder as the page contents. The
        contents and the URL are segmented with ``jieba.lcut_for_search``
        before being indexed. Files placed directly in ``root`` are not
        indexed.

        Each file is visited exactly once and opened from the directory it
        actually lives in, so nested sub-directories are handled as well.

        :param root: directory whose sub-directories hold the page files.
        :param writer: index writer receiving one document per file.
        """
        # URL fragments that are dropped from the segmented "site" field.
        url_noise = {'.', 'http', 'https', '/', ':', '?', '=', 'html',
                     'shtml', 'www'}

        # Only the first os.walk entry is needed here: it lists the immediate
        # sub-directories of root (nothing is yielded if root is missing).
        _, top_dirnames, _ = next(os.walk(root), (root, [], []))
        for dirname in top_dirnames:
            # Walk each sub-directory once, including nested folders.
            for dirpath, _, filenames in os.walk(os.path.join(root, dirname)):
                for filename in filenames:
                    print("adding", filename)
                    path = os.path.join(dirpath, filename)
                    # The context manager closes the file even if a read fails.
                    with open(path, encoding='utf8') as file:
                        page = file.readline()
                        title = file.readline()
                        contents = file.read()

                    # jieba word segmentation
                    contents = ' '.join(jieba.lcut_for_search(contents))
                    url = page
                    seg_url = jieba.lcut_for_search(page)
                    page = ' '.join(set(seg_url) - url_noise)

                    doc = Document()
                    doc.add(StringField("name", filename, Field.Store.YES))
                    doc.add(StringField("path", path, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(TextField('title', title, Field.Store.YES))
                        doc.add(TextField('site', page, Field.Store.YES))
                        doc.add(TextField('url', url, Field.Store.YES))
                        doc.add(TextField('contents', contents, Field.Store.YES))
                    else:
                        print("warning: no content in %s" % filename)
                    # A document with only name and path is still added when
                    # the file has no contents.
                    writer.addDocument(doc)
Esempio n. 4
0
 def index_one(self, article):
     """Add one article record to the index.

     The ``date_published`` value is formatted with ``self.date_format``;
     when that fails a warning is logged and the article is skipped.
     The formatted date is added as a stored string field, as a sorted
     doc-values field and as a stored field, all named ``date_published``.

     :param article: mapping providing ``date_published``, ``group_id``,
         ``article_id``, ``domain``, ``site_type``, ``canonical_url``,
         ``title``, ``meta``, ``content`` and ``uq_id_str``.
     """
     try:
         published = article['date_published'].strftime(self.date_format)
     except Exception as e:
         logger.warning('Error when formating date_published %r: %s ',
                        article['canonical_url'], e)
         return

     keep = Field.Store.YES
     drop = Field.Store.NO
     doc = Document()
     # Fields are built and added in this exact order.
     fields = (
         StoredField('group_id', article['group_id']),
         StoredField('article_id', article['article_id']),
         StringField('date_published', published, keep),
         SortedDocValuesField('date_published', BytesRef(published)),
         StoredField('date_published', published),
         StringField('domain', article['domain'], keep),
         StringField('site_type', article['site_type'], keep),
         TextField('canonical_url', article['canonical_url'], keep),
         TextField('title', article['title'], keep),
         # meta and content are searchable but not stored
         TextField('meta', article['meta'], drop),
         TextField('content', article['content'], drop),
         StoredField('uq_id_str', article['uq_id_str']),
     )
     for field in fields:
         doc.add(field)
     self.writer.addDocument(doc)
Esempio n. 5
0
	def getDoc(self, file):
		"""Build a Lucene document from the filtered text of an HTML file.

		The file is read from ``os.getcwd() + FILE_DIR + '/' + file`` and
		parsed with BeautifulSoup; the text nodes accepted by ``tag_vis``
		are joined with spaces and stored in the ``contents`` field. The
		``filename`` field holds the last path component of ``file`` up to
		its first dot, and ``filepath`` holds the full path (not stored).

		:param file: path of the HTML file, relative to the directory above.
		:return: the populated ``Document``, or ``None`` when the file cannot
			be read or parsed (the error is printed).
		"""
		try:
			full_path = os.getcwd()+FILE_DIR+'/'+file
			# The context manager closes the file on every exit path.
			with open(full_path, "r") as f:
				try:
					s = BeautifulSoup(f, 'html.parser')
					text = s.findAll(text=True)
					# Materialise the filter so the bytes fallback below sees
					# every fragment, not a partly consumed iterator.
					c = list(filter(tag_vis, text))
					try:
						c = ' '.join(c)
					except TypeError:
						c = b' '.join(c)
				except Exception as e:
					print(str(e))
					return
			content = TextField("contents", c, Field.Store.YES)
			fileName = str(Paths.get(file)).split('/')[-1]
			# Keep everything before the first dot; a name without any dot
			# is kept whole instead of losing its last character.
			fileName = fileName.split('.', 1)[0]
			filename = TextField("filename", fileName, Field.Store.YES)
			path = TextField("filepath", str(full_path), Field.Store.NO)
			doc = Document()
			doc.add(content)
			doc.add(filename)
			doc.add(path)
			return doc
		except Exception as e:
			# Report the class of the exception that was actually raised.
			print(type(e).__name__)
			print(str(e))
			return
Esempio n. 6
0
    def get_doc(self, filename, path, title, url, contents):
        '''
        Build the `Document` describing one webpage.

        Input: `filename`: filename of the webpage
               `path`: path of the webpage
               `title`: title of the webpage
               `url`: original url of the webpage
               `contents`: contents of the webpage
        Output: `Document` holding the stored fields `name`, `path`, `title`
                and `url`, plus `contents` when it is not empty
        '''
        stored = Field.Store.YES
        doc = Document()
        # name and path are StringFields; title and url are TextFields
        for field in (StringField("name", filename, stored),
                      StringField("path", path, stored),
                      TextField("title", title, stored),
                      TextField("url", url, stored)):
            doc.add(field)
        if len(contents) == 0:
            print("Warning: No content in {}".format(filename))
            return doc
        doc.add(TextField("contents", contents, stored))
        return doc
Esempio n. 7
0
    def index(cls, indexDir, taxoDir):
        """Build a search index and a taxonomy index from the sample data.

        One document is created per entry of ``docTexts``, with its title
        taken from ``docTitles`` and its facets from ``categories``.

        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        from org.apache.lucene.util import Version

        # Both writers are opened in CREATE mode.
        writer_config = IndexWriterConfig(Version.LUCENE_42,
                                          WhitespaceAnalyzer(Version.LUCENE_42))
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        index_writer = IndexWriter(indexDir, writer_config)
        taxonomy_writer = DirectoryTaxonomyWriter(
            taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # FacetFields adds the facet fields of a document and registers the
        # categories with the taxonomy writer.
        facet_fields = FacetFields(taxonomy_writer)

        docs_added = 0
        facets_added = 0
        for doc_num, doc_text in enumerate(docTexts):
            # The Python list is converted with java.util.Arrays (via JCC)
            # so that a Java List is handed to addFields().
            # lucene.collections.JavaList would be an alternative wrapper.
            category_paths = Arrays.asList(
                [CategoryPath(facet) for facet in categories[doc_num]])

            # Regular fields: stored title, unstored text.
            doc = Document()
            doc.add(TextField(TITLE, docTitles[doc_num], Field.Store.YES))
            doc.add(TextField(TEXT, doc_text, Field.Store.NO))
            facet_fields.addFields(doc, category_paths)
            index_writer.addDocument(doc)

            docs_added += 1
            facets_added += category_paths.size()

        # The taxonomy is committed before the search index so that every
        # facet referenced by an indexed document exists in the taxonomy.
        taxonomy_writer.commit()
        index_writer.commit()

        # Close the taxonomy writer, then the index writer.
        taxonomy_writer.close()
        index_writer.close()
        print("Indexed %d documents with overall %d facets." % (docs_added, facets_added))
Esempio n. 8
0
 def index(self, eid, name, cname, type, info):
     """Add one record to the index as five stored text fields.

     The values are stored under the field names ``id``, ``name``,
     ``CannonicalName``, ``type`` and ``info``, in that order.
     """
     named_values = (
         ('id', eid),
         ('name', name),
         ('CannonicalName', cname),
         ('type', type),
         ('info', info),
     )
     entry = Document()
     for field_name, value in named_values:
         entry.add(TextField(field_name, value, Field.Store.YES))
     self.writer.addDocument(entry)
def create_document_by_document_sentence(org_title, preprocessed_title, doc_id,
                                         sentence):
    """Build the Lucene document for one sentence of a source document.

    ``org_title`` and ``doc_id`` (converted with ``str``) are added as
    ``StringField``; ``preprocessed_title`` and ``sentence`` are added as
    ``TextField``. All four fields are stored.

    :return: the populated ``Document``.
    """
    keep = Field.Store.YES
    doc = Document()
    for field in (
            StringField("org_title", org_title, keep),
            TextField("preprocessed_title", preprocessed_title, keep),
            StringField("doc_id", str(doc_id), keep),
            TextField("sentence", sentence, keep),
    ):
        doc.add(field)
    return doc
Esempio n. 10
0
def index_files():
    """Index the tweets stored in the ``.json`` files of the input directory.

    The input directory (``input/``) and the index directory
    (``lucene_index/``) are resolved next to the running script
    (``sys.argv[0]``). Every line of every ``.json`` file is parsed as one
    JSON object and indexed with the stored text fields ``filename``,
    ``username``, ``text``, ``time`` and, when the user location is set,
    ``location``.

    The current thread is attached to the already initialised Lucene VM;
    ``lucene.initVM()`` must have been called beforehand.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    BASE_DIR = path.dirname(path.abspath(sys.argv[0]))
    INPUT_DIR = BASE_DIR + "/input/"
    INDEX_DIR = BASE_DIR + "/lucene_index/"

    NoT = 100000  # Number of Tokens
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("------------------------------------------------------")
    print("PyLucene Demo started (lucene_demo.py)")
    print("Python version: %d.%d.%d" % (
        sys.version_info.major, sys.version_info.minor, sys.version_info.micro))
    print("Lucene version: %s" % lucene.VERSION)
    print("------------------------------------------------------\n")

    index_path = Paths.get(INDEX_DIR)
    directory = SimpleFSDirectory(index_path)

    # The standard analyzer is wrapped so that at most NoT tokens are
    # accepted by the token-count limit.
    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    print("Number of indexed documents: %d\n" % writer.numDocs())
    for input_file in listdir(INPUT_DIR):  # iterate over all input files
        print("Current file: %s" % input_file)
        if input_file.endswith(".json"):
            with open(INPUT_DIR + input_file) as f:
                for line in f:
                    o = json.loads(line)
                    doc = Document()  # create a new document
                    doc.add(TextField("filename", input_file, Field.Store.YES))
                    doc.add(
                        TextField("username", o['user']['screen_name'],
                                  Field.Store.YES))
                    # The tweet text itself must be indexed as well.
                    doc.add(TextField("text", o['text'], Field.Store.YES))
                    if o['user']['location']:
                        doc.add(
                            TextField("location", o['user']['location'],
                                      Field.Store.YES))
                    doc.add(TextField("time", o['created_at'],
                                      Field.Store.YES))
                    writer.addDocument(
                        doc)  # add the document to the IndexWriter
    print("\nNumber of indexed documents: %d" % writer.numDocs())
    writer.close()
    print("Finished\n")
    print("-----------------------------------------------------")
def create_document_by_document_content(org_title, preprocessed_title,
                                        preprocessed_title_lower, content):
    """Build the Lucene document holding a whole document's content.

    ``org_title`` and ``preprocessed_title_lower`` are added as
    ``StringField``; ``preprocessed_title`` and ``content`` are added as
    ``TextField``. All four fields are stored.

    :return: the populated ``Document``.
    """
    keep = Field.Store.YES
    doc = Document()
    for field in (
            StringField("org_title", org_title, keep),
            TextField("preprocessed_title", preprocessed_title, keep),
            StringField("preprocessed_title_lower", preprocessed_title_lower,
                        keep),
            TextField("content", content, keep),
    ):
        doc.add(field)
    return doc
Esempio n. 12
0
def createDocument_tweet(data):
    """Build a Lucene document from a tweet dictionary.

    Reads ``data['text']``, ``data['user']['screen_name']`` and
    ``data['coordinates']``. Only the first two are indexed, as the stored
    text fields ``tweet`` and ``u_name``.

    :return: the populated ``Document``.
    """
    tweet_text = data['text']
    user_name = data['user']['screen_name']
    # Read but not indexed; a missing key still raises KeyError here.
    _ = data['coordinates']

    document = Document()
    for field_name, value in (("tweet", tweet_text), ("u_name", user_name)):
        document.add(TextField(field_name, value, Field.Store.YES))
    return document
Esempio n. 13
0
    def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice):
        """Add one product record to the index stored in ``self.dir``.

        ``goodname`` and ``shopname`` are decoded from UTF-8 and segmented
        with jieba; the segmented forms are indexed (not stored) as
        ``goodName_s`` / ``shopName_s`` while the raw values are stored
        without being indexed. ``salenum`` is added as an ``IntField`` and
        ``price`` as a ``DoubleField``.

        A failure while building or adding the document is printed and not
        re-raised. The writer is always closed, which commits the pending
        document and releases the index write lock.
        """
        # Stored-only field type: neither tokenized nor indexed.
        noIndexedString = FieldType()
        noIndexedString.setTokenized(False)
        noIndexedString.setIndexed(False)
        noIndexedString.setStored(True)

        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        # CREATE_OR_APPEND: extend an existing index, or create a new one.
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(self.dir, config)

        try:
            print("adding %s" % goodname)

            goodname_s = unicode(goodname, 'utf8')
            seg_list_good = jieba.cut(goodname_s, cut_all=False)  # default mode
            goodname_s = " ".join(seg_list_good)

            shopname_s = unicode(shopname, 'utf8')
            seg_list_shop = jieba.cut(shopname_s, cut_all=False)  # default mode
            shopname_s = " ".join(seg_list_shop)

            doc = Document()
            doc.add(Field("shopName", shopname, noIndexedString))
            doc.add(TextField("shopName_s", shopname_s, Field.Store.NO))
            doc.add(Field("goodName", goodname, noIndexedString))
            doc.add(TextField("goodName_s", goodname_s, Field.Store.NO))
            doc.add(IntField("saleNum", salenum, Field.Store.YES))
            doc.add(DoubleField("price", price, Field.Store.YES))
            doc.add(Field("url", url, noIndexedString))
            doc.add(StringField("pictureName", picturename, Field.Store.YES))
            doc.add(Field("comments", comment, noIndexedString))
            doc.add(Field("historyPrice", historyprice, noIndexedString))

            writer.addDocument(doc)
        except Exception as e:
            print("Failed in indexDocs: %s" % e)
        finally:
            # Without close() the document is never committed and the next
            # IndexWriter on the same directory cannot obtain the lock.
            writer.close()
def obj_to_document(obj):
    """Convert ``obj`` into a Lucene document.

    The document gets an ``index`` field (from ``obj.index``) and a ``type``
    field (the class name of ``obj``). Every attribute of ``obj.data`` is
    then added according to its Python type, together with a companion
    ``<name> + LTPF_TYPE`` field recording which ``LT_*`` type tag was used.
    String and ``hyper_text`` values additionally get a jieba-segmented,
    unstored ``<name> + LTPF_FOR_QUERY`` field.

    :raises Exception: when an attribute value has an unsupported type.
    """
    def conv_to_str(x):
        return x.encode('utf8') if isinstance(x, unicode) else str(x)

    def stored_only(name, value):
        # Stored but not indexed.
        return Field(name, value, Field.Store.YES, Field.Index.NO)

    def query_field(name, text):
        # Segmented text, indexed but not stored.
        return TextField(name + LTPF_FOR_QUERY,
                         ' '.join(jieba.lcut_for_search(text)),
                         Field.Store.NO)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            res.add(stored_only(k, ''))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            # Duplicates are dropped by going through a set.
            if v and isinstance(v[0], int):
                res.add(TextField(k, ' '.join(str(x) for x in set(v)),
                                  Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(set(v)), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, (str, unicode)):
            res.add(stored_only(k, v))
            res.add(query_field(k, v))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            res.add(stored_only(k, v.raw))
            res.add(query_field(k, v.text))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            # bool is tested before int because bool is a subclass of int
            res.add(StringField(k, '1' if v else '0', Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, (int, long)):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        res.add(stored_only(k + LTPF_TYPE, fieldtype))
    return res
Esempio n. 15
0
    def IndexDocs(self, documents):
        """
        Add or update the given documents in the index, then close the writer.

        :Parameters:
        - `documents`: Documents to be indexed (List); each one provides
          the keys `name`, `content`, `date`, `url` and `tags`
        """
        config = IndexWriterConfig(self.__analyzer)
        writer = IndexWriter(self.__indexDir, config)

        for document in documents:
            name = document['name']
            doc = Document()
            doc.add(TextField(Indexer.NAME, name, Field.Store.YES))
            doc.add(Field(Indexer.CONTENT, document['content'],
                          self.__contentType))
            doc.add(StringField(Indexer.DATE, document['date'],
                                Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'],
                                Field.Store.YES))
            doc.add(TextField(Indexer.TAGS,
                              self.__qualifyTags(document['tags']),
                              Field.Store.YES))
            doc.add(LongPoint(Indexer.TIMESTAMP,
                              self.__getTimestamp(document['date'])))

            if self.__boAppend:
                # Appending to an existing index: replace any previously
                # indexed document carrying the same name term.
                if self.__verbose:
                    print("Updating " + name)
                writer.updateDocument(Term(Indexer.NAME, name), doc)
            else:
                # Fresh index: no older copy can exist, a plain add suffices.
                if self.__verbose:
                    print("Adding " + name)
                writer.addDocument(doc)

        # Report the totals before closing the writer.
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()
def addDoc(w, data):
    """Build one document from ``data`` and add it to the writer ``w``.

    ``data`` maps each field name to a sequence whose first item is the
    value and whose second item is the name of the field kind. Unknown
    kinds are reported with ``UNKNOWN FIELD`` and skipped.
    """
    doc = Document()
    for field in data:
        entry = data[field]
        value = entry[0]
        kind = entry[1]
        if kind == 'StringField':
            lucene_field = StringField(field, value, Field.Store.YES)
        elif kind == 'TextField':
            lucene_field = TextField(field, value, Field.Store.YES)
        elif kind == 'CUSTOM_FIELD_TEXT':
            lucene_field = Field(field, value, CUSTOM_FIELD_TEXT)
        elif kind == 'CUSTOM_FIELD_TEXT_DF':
            lucene_field = Field(field, value, CUSTOM_FIELD_TEXT_DF)
        elif kind == 'CUSTOM_FIELD_TEXT_BF':
            lucene_field = Field(field, value, CUSTOM_FIELD_TEXT_BF)
        elif kind == 'INTEGER_STORED':
            lucene_field = StoredField(field, value)
        else:
            print('UNKNOWN FIELD')
            continue
        doc.add(lucene_field)

    w.addDocument(doc)
Esempio n. 17
0
    def indexDocs(self, root, writer):
        """Index the images described by the text files found under ``root``.

        For every file, the first line is read as the page URL, the second
        line as the page title and the remainder as the contents, from which
        ``self.getTxtAttribute`` extracts the ``img_url`` and ``img_info``
        sequences. One document is added per image whose info is not empty,
        with the info segmented by ``jieba.lcut_for_search``.

        :param root: directory walked recursively for input files.
        :param writer: index writer receiving the image documents.
        """
        for dirpath, _, filenames in os.walk(root):
            for filename in filenames:
                print("adding", filename)
                path = os.path.join(dirpath, filename)
                # The context manager closes the file even if a read fails.
                with open(path, encoding='utf8') as file:
                    url = file.readline()
                    title = file.readline()
                    contents = file.read()
                img_url = self.getTxtAttribute(contents, 'img_url')
                img_info = self.getTxtAttribute(contents, 'img_info')
                for i, image_url in enumerate(img_url):
                    # Indexed access keeps the IndexError when img_info is
                    # shorter than img_url instead of silently truncating.
                    info = img_info[i]
                    if not info:
                        continue
                    doc = Document()
                    doc.add(StringField('title', title, Field.Store.YES))
                    doc.add(StringField('url', url, Field.Store.YES))
                    doc.add(StringField('img_url', image_url, Field.Store.YES))
                    segmented_info = ' '.join(jieba.lcut_for_search(info))
                    doc.add(
                        TextField('contents', segmented_info, Field.Store.YES))
                    writer.addDocument(doc)
def addDoc(w, data):
    """Build one document from ``data`` and add it to the writer ``w``.

    ``data`` maps each field name to a sequence whose first item is the
    value and whose second item is the name of the field kind. Unknown
    kinds are reported with ``UNKNOWN FIELD`` and skipped.

    When adding the document fails, every field name and value is printed
    and the error is not re-raised.
    """
    doc = Document()
    for field in data:
        value, kind = data[field][0], data[field][1]

        if kind == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif kind == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif kind == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif kind == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif kind == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit still propagate.
        print('-----------------------------------')
        for field in data:
            print('field=%s\nvalue=%s' % (field, str(data[field][0])))
Esempio n. 19
0
    def indexDocs(self, root, writer):
        """Index every ``TEXT`` element of the XML files under ``root``.

        The files matching ``root + "/data/*/*.xml"`` are parsed strictly
        first and re-parsed with a recovering parser when they contain a
        syntax error. Each ``TEXT`` element yields one document whose ``id``
        is the text of the ``DOCNO`` child of the element's parent; the
        ``contents`` field is only added when the text is not empty.

        :return: number of documents added to ``writer``.
        """
        indexed = 0
        for xml_path in glob.glob(root + "/data/*/*.xml"):
            try:
                strict = etree.XMLParser(recover=False, strip_cdata=False)
                tree = etree.parse(xml_path, parser=strict)
            except etree.XMLSyntaxError:
                # Malformed file: parse it again in recovery mode.
                lenient = etree.XMLParser(recover=True, strip_cdata=False)
                tree = etree.parse(xml_path, parser=lenient)

            for text in tree.getroot().iter("TEXT"):
                contents = "".join(text.xpath("text()")).strip()
                doc_no = text.getparent().find("DOCNO").text
                try:
                    doc = Document()
                    doc.add(StringField("id", doc_no, Field.Store.YES))
                    if contents:
                        doc.add(
                            TextField("contents", contents, Field.Store.YES))
                    writer.addDocument(doc)
                    indexed += 1
                except Exception as e:
                    print("Failed in indexDocs:", e)
        return indexed
Esempio n. 20
0
def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    """Add one record with a name, two dates and two notes to the writer ``w``.

    ``name`` is added as a stored ``TextField``; the dates and notes are
    added as stored ``StringField`` values.
    """
    exact_fields = (
        ("birth_date", birth_date),
        ("death_date", death_date),
        ("birth_note", birth_note),
        ("death_note", death_note),
    )
    doc = Document()
    doc.add(TextField("name", name, Field.Store.YES))
    for field_name, value in exact_fields:
        doc.add(StringField(field_name, value, Field.Store.YES))
    w.addDocument(doc)
Esempio n. 21
0
def create_document(file_name):
    """Build a Lucene document for one file of the ``./alldocs/`` directory.

    :param file_name: name of the file inside ``./alldocs/``; it is stored
        as the ``title`` field.
    :return: ``Document`` with the stored fields ``title`` (the file name)
        and ``text`` (the whole file content).
    """
    path = './alldocs/' + file_name
    # The context manager closes the file even if reading fails.
    with open(path) as file:
        text = file.read()
    doc = Document()
    # The title comes from the parameter, not from a global variable that
    # only exists while the caller's loop is running.
    doc.add(StringField("title", file_name, Field.Store.YES))
    doc.add(TextField("text", text, Field.Store.YES))
    return doc
Esempio n. 22
0
def create_doc(data):
    """Build a Lucene document from a tweet dictionary.

    ``data`` must provide ``screen_name``, ``tweet``, ``tweet_date``,
    ``tweet_location`` and ``page_title``. The first three are always
    indexed (as ``username``, ``text`` and ``date``); the location and the
    page title are only indexed when they are truthy.

    :return: the populated ``Document``.
    """
    # All keys are read up front, in this order.
    username = data['screen_name']
    text = data['tweet']
    date = data['tweet_date']
    location = data['tweet_location']
    title = data['page_title']

    named_values = [("username", username), ("text", text), ("date", date)]
    if location:
        named_values.append(("location", location))
    if title:
        named_values.append(("page title", title))

    doc = Document()
    for field_name, value in named_values:
        doc.add(TextField(field_name, value, Field.Store.YES))
    return doc
Esempio n. 23
0
 def index_document(self, wiki_doc):
     """
     Index every section of a wiki document as its own Lucene document.

     Each section document carries the article id and title, the section
     title and text, and a section id of the form ``<article id>_<position>``
     where the position starts at 0.

     :param wiki_doc: the document to be indexed.
     :return:
     """
     keep = Field.Store.YES
     for position, section in enumerate(wiki_doc.sections):
         section_id = "_".join((str(wiki_doc.id), str(position)))
         doc = Document()
         doc.add(StringField("id_article", wiki_doc.id, keep))
         doc.add(TextField("title_article", wiki_doc.title, keep))
         doc.add(StringField("id_section", section_id, keep))
         doc.add(TextField("title_section", section.title, keep))
         doc.add(TextField("content_section", section.text, keep))
         self.writer.addDocument(doc)
 def addDocument(self, id):
     """Index the entry of ``answers_train`` stored under ``id`` and commit.

     The entry is stored in the text field ``pa`` and ``str(id)`` in the
     string field ``id``. The writer ``self.w`` is committed after the add.
     """
     # answers_train is a module-level name that is only read here.
     answer = answers_train[id]
     entry = Document()
     entry.add(TextField("pa", answer, Field.Store.YES))
     entry.add(StringField("id", str(id), Field.Store.YES))
     self.w.addDocument(entry)
     self.w.commit()
Esempio n. 25
0
def addDoc(w, doc_name, text, file_name):
    """
    Add a single document to the index.

    All three values are added as stored ``TextField`` values, under the
    field names ``text``, ``doc_name`` and ``corpus_name``.

    :param w: writer
    :param doc_name: value of the ``doc_name`` field
    :param text: value of the ``text`` field
    :param file_name: value of the ``corpus_name`` field
    :return:
    """
    named_values = (
        ("text", text),
        ('doc_name', doc_name),
        ('corpus_name', file_name),
    )
    doc = Document()
    for field_name, value in named_values:
        doc.add(TextField(field_name, value, Field.Store.YES))
    w.addDocument(doc)
Esempio n. 26
0
def create_document(file_name):
    """Build a Lucene document for one file of ``INPUT_DIR``.

    :param file_name: name of the file, appended to ``INPUT_DIR``; it is
        stored as the ``title`` field.
    :return: ``Document`` with the stored fields ``title`` (the file name)
        and ``text`` (the whole file content).
    """
    path = INPUT_DIR + file_name  # assemble the file path
    # The context manager closes the file even if reading fails.
    with open(path) as file:
        text = file.read()
    doc = Document()  # create a new document
    # The title comes from the parameter, not from a global variable that
    # only exists while the caller's loop is running.
    doc.add(StringField("title", file_name, Field.Store.YES))
    # add the whole file content
    doc.add(TextField("text", text, Field.Store.YES))
    return doc
Esempio n. 27
0
    def get_doc(self, doc_info, contents):
        '''
        Build the `Document` describing one webpage.

        Input: `doc_info`: info of the doc (`name`, `path`, `title`, `url`, `site`)
               `contents`: contents of the webpage
        Output: `Document` with `name`, `path`, `title` and `url` as
                StringFields, `site` as a TextField, and `contents` as a
                TextField when it is not empty
        '''
        stored = Field.Store.YES
        doc = Document()
        for key in ("name", "path", "title", "url"):
            doc.add(StringField(key, doc_info[key], stored))
        doc.add(TextField("site", doc_info['site'], stored))
        if len(contents) == 0:
            print("Warning: No content in {}".format(doc_info['name']))
            return doc
        doc.add(TextField("contents", contents, stored))
        return doc
Esempio n. 28
0
def retrival_answer(MAX):
    """Measure how often a query retrieves its ground-truth document line.

    Every line of ``Huawei_result/document.txt`` is indexed in an in-memory
    index. Each line of ``Huawei_result/query.txt`` is then searched, and
    the query at position ``n`` counts as a success when at least one of the
    retrieved texts scores 89 or more in ``process.extract`` against
    document line ``n``. The resulting accuracy is printed; an empty query
    file yields an accuracy of 0.0.

    :param MAX: maximum number of hits requested per query.
    """
    lucene.initVM()
    directory = RAMDirectory()

    # NOTE(review): this on-disk directory is opened but never used below;
    # it is kept in case something relies on it being opened -- confirm.
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("%d docs in index" % writer.numDocs())
    print("Reading lines from Document...")

    # Context managers close the input files once they have been read.
    with open("Huawei_result/document.txt", "r") as process_doc:
        doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print("Indexed from %d docs in index" % (writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()

    with open("Huawei_result/query.txt", "r") as process_query:
        query_line = process_query.readlines()

    # The index no longer changes after writer.close(), so one analyzer and
    # one reader/searcher serve every query.
    analyzer = StandardAnalyzer()
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    accuracy = []
    try:
        for n, one_query in enumerate(query_line):
            query = QueryParser("text", analyzer).parse(one_query)
            hits = searcher.search(query, MAX)
            candidate_doc = [searcher.doc(hit.doc).get("text")
                             for hit in hits.scoreDocs]

            choices = process.extract(unicode(doc_line[n]), candidate_doc)
            # Each choice is indexed at [1] for its score.
            matched = any(choice[1] >= 89 for choice in choices)
            accuracy.append(1 if matched else 0)
    finally:
        reader.close()

    # Guard against an empty query file instead of dividing by zero.
    if accuracy:
        final_accuracy = float(sum(accuracy)) / float(len(accuracy))
    else:
        final_accuracy = 0.0

    print("the final accuracy is: %s" % final_accuracy)
Esempio n. 29
0
    def get_doc(self, img):
        '''
        Build the `Document` describing one image.

        Input: `img`: dict containing a single image info (`img_url`,
               `description`, `url`, `url_title`)
        Output: `Document` with `description` as a TextField and the other
                three values as StringFields, all stored
        '''
        stored = Field.Store.YES
        doc = Document()
        for field in (StringField("img_url", img['img_url'], stored),
                      TextField("description", img['description'], stored),
                      StringField("url", img['url'], stored),
                      StringField("url_title", img['url_title'], stored)):
            doc.add(field)
        return doc
Esempio n. 30
0
def addDoc(w, text):
    """
    Add a single one-field document to the index.

    :param w: writer
    :param text: value of the stored, tokenized ``text`` field
    :return:
    """
    single = Document()
    text_field = TextField("text", text, Field.Store.YES)
    single.add(text_field)
    w.addDocument(single)