def index_single_file(self, doc_file):
    """Index every suitable wiki article found in ``doc_file``.

    The file holds one JSON object per line with 'title', 'plaintext' and
    '_id' keys. Articles with fewer than three paragraphs are skipped.
    Returns the number of Lucene documents added.
    """
    logger.info("adding {}".format(doc_file))
    added = 0
    try:
        with open(doc_file) as handle:
            for raw_line in handle:
                article = json.loads(raw_line)
                body = article['plaintext']
                # Skip very short articles (fewer than 3 paragraphs).
                if len(body.split('\n\n')) < 3:
                    continue
                cleaned = rm_special_chars(body)
                doc = Document()
                doc.add(StringField("id", str(article['_id']), Field.Store.YES))
                doc.add(TextField("title", article['title'], Field.Store.YES))
                doc.add(TextField("text", cleaned, Field.Store.YES))
                self.writer.addDocument(doc)
                added += 1
                if added % 10000 == 0:
                    logger.info('added {} lucene docs'.format(added))
    except Exception as e:
        import traceback
        traceback.print_tb(e.__traceback__)
        logger.error("Failed in: {}".format(doc_file))
    return added
def get_document(self, title, body, tags, date):
    """Assemble a Document whose title/body/tags/date are all stored,
    tokenized TextFields (field names come from the class constants)."""
    doc = Document()
    for field_name, value in ((self.INDEX_FILED_TITLE, title),
                              (self.INDEX_FILED_BODY, body),
                              (self.INDEX_FILED_TAGS, tags),
                              (self.INDEX_FILED_DATE, date)):
        doc.add(TextField(field_name, value, Field.Store.YES))
    return doc
def indexDocs(self, root, writer):
    """Walk every sub-directory of ``root`` and index each crawled page file.

    File format: first line = URL, second line = title, remainder = body.
    Body and URL are segmented with jieba so Chinese text is searchable.
    Fix: the file handle is now closed via ``with`` (it previously leaked
    on any exception during reading).
    """
    for top, dirnames, filenames in os.walk(root):
        for dirname in dirnames:  # iterate over the sub-directories
            path1 = os.path.join(top, dirname)
            for _, _, filenames in os.walk(path1):  # files inside this sub-directory
                for filename in filenames:
                    print("adding", filename)
                    path = os.path.join(path1, filename)
                    with open(path, encoding='utf8') as file:
                        page = file.readline()
                        title = file.readline()
                        contents = file.read()
                    # jieba segmentation of the body text
                    seg_contents = jieba.lcut_for_search(contents)
                    contents = ' '.join(seg_contents)
                    url = page
                    # Segment the URL and drop boilerplate tokens so only
                    # meaningful site words remain for the "site" field.
                    seg_url = jieba.lcut_for_search(page)
                    page = ' '.join(list(
                        set(seg_url)
                        - set(['.', 'http', 'https', '/', ':', '?', '=', 'html', 'shtml', 'www'])))
                    doc = Document()
                    doc.add(StringField("name", filename, Field.Store.YES))
                    doc.add(StringField("path", path, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(TextField('title', title, Field.Store.YES))
                        doc.add(TextField('site', page, Field.Store.YES))
                        doc.add(TextField('url', url, Field.Store.YES))
                        doc.add(TextField('contents', contents, Field.Store.YES))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
def index_one(self, article):
    """Create index for one url object in the database.

    Bails out (with a warning) when ``date_published`` cannot be formatted.
    """
    try:
        date_published_str = article['date_published'].strftime(self.date_format)
    except Exception as e:
        logger.warning('Error when formating date_published %r: %s ',
                       article['canonical_url'], e)
        return
    doc = Document()
    # Numeric ids: stored only, not searchable.
    doc.add(StoredField('group_id', article['group_id']))
    doc.add(StoredField('article_id', article['article_id']))
    # date_published appears three ways: exact-match, sortable doc-value, stored.
    doc.add(StringField('date_published', date_published_str, Field.Store.YES))
    doc.add(SortedDocValuesField('date_published', BytesRef(date_published_str)))
    doc.add(StoredField('date_published', date_published_str))
    # Exact-match metadata fields.
    doc.add(StringField('domain', article['domain'], Field.Store.YES))
    doc.add(StringField('site_type', article['site_type'], Field.Store.YES))
    # Tokenized text fields; meta/content are searchable but not stored.
    doc.add(TextField('canonical_url', article['canonical_url'], Field.Store.YES))
    doc.add(TextField('title', article['title'], Field.Store.YES))
    doc.add(TextField('meta', article['meta'], Field.Store.NO))
    doc.add(TextField('content', article['content'], Field.Store.NO))
    doc.add(StoredField('uq_id_str', article['uq_id_str']))
    self.writer.addDocument(doc)
def getDoc(self, file):
    """Parse an HTML file into a Document with contents/filename/filepath fields.

    Returns None when the file cannot be read or parsed.
    Fixes: the file handle is now closed via ``with`` (previously leaked), and
    the outer handler reports the real exception type — the original printed
    ``type(Exception).__name__``, which is always the literal string 'type'.
    """
    full_path = os.getcwd() + FILE_DIR + '/' + file
    try:
        with open(full_path, "r") as f:
            try:
                soup = BeautifulSoup(f, 'html.parser')
                text = soup.findAll(text=True)
                c = filter(tag_vis, text)
                try:
                    c = ' '.join(c)
                except Exception:
                    # Fallback when the fragments are byte strings.
                    c = b' '.join(c)
            except Exception as e:
                print(str(e))
                return
        content = TextField("contents", c, Field.Store.YES)
        fileName = str(Paths.get(file)).split('/')[-1]
        fileName = fileName[:fileName.find(".")]  # strip the extension
        filename = TextField("filename", fileName, Field.Store.YES)
        path = TextField("filepath", str(full_path), Field.Store.NO)
        doc = Document()
        doc.add(content)
        doc.add(filename)
        doc.add(path)
        return doc
    except Exception as e:
        print(type(e).__name__)  # bug fix: was type(Exception).__name__
        print(str(e))
        return
def get_doc(self, filename, path, title, url, contents):
    '''
    Generate a `Document` according to the parameters.
    Input: `filename`: filename of the webpage
           `path`: path of the webpage
           `title`: title of the webpage
           `url`: original url of the webpage
           `contents`: contents of the webpage
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    # Exact-match fields for identification.
    doc.add(StringField("name", filename, Field.Store.YES))
    doc.add(StringField("path", path, Field.Store.YES))
    # Tokenized fields for searching.
    doc.add(TextField("title", title, Field.Store.YES))
    doc.add(TextField("url", url, Field.Store.YES))
    if contents:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(filename))
    return doc
def index (cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer from org.apache.lucene.util import Version config = IndexWriterConfig(Version.LUCENE_42, WhitespaceAnalyzer(Version.LUCENE_42)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # FacetFields is a utility class for adding facet fields to a document: facet_fields = FacetFields(taxo) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [CategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List: facetList = Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # use the FacetFields utility class for adding facet fields (i.e. the categories) # to the document (and, as required, to the taxonomy index) facet_fields.addFields(doc, facetList) # finally add the document to the index iw.addDocument(doc) nDocsAdded +=1 nFacetsAdded += facetList.size() # end for # commit changes. 
# we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
def index(self, eid, name, cname, type, info):
    """Index one entity record; every attribute becomes a stored,
    tokenized TextField."""
    doc = Document()
    for field_name, value in (('id', eid),
                              ('name', name),
                              ('CannonicalName', cname),
                              ('type', type),
                              ('info', info)):
        doc.add(TextField(field_name, value, Field.Store.YES))
    self.writer.addDocument(doc)
def create_document_by_document_sentence(org_title, preprocessed_title, doc_id, sentence):
    """Build a Document for one sentence of an article.

    Original title and doc id are exact-match (StringField); the
    preprocessed title and the sentence are tokenized (TextField).
    """
    doc = Document()
    for ctor, field_name, value in ((StringField, "org_title", org_title),
                                    (TextField, "preprocessed_title", preprocessed_title),
                                    (StringField, "doc_id", str(doc_id)),
                                    (TextField, "sentence", sentence)):
        doc.add(ctor(field_name, value, Field.Store.YES))
    return doc
def index_files(): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() BASE_DIR = path.dirname(path.abspath(sys.argv[0])) INPUT_DIR = BASE_DIR + "/input/" INDEX_DIR = BASE_DIR + "/lucene_index/" NoT = 100000 # Number of Tokens print "------------------------------------------------------" print "PyLucene Demo started (lucene_demo.py)" print "Python version: %d.%d.%d" % ( sys.version_info.major, sys.version_info.minor, sys.version_info.micro) print 'Lucene version:', lucene.VERSION print "------------------------------------------------------\n" # lucene.initVM() # directory = RAMDirectory() index_path = Paths.get(INDEX_DIR) directory = SimpleFSDirectory(index_path) analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, NoT) config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, config) print "Number of indexed documents: %d\n" % writer.numDocs() for input_file in listdir(INPUT_DIR): # iterate over all input files print "Current file:", input_file if input_file.endswith(".json"): with open(INPUT_DIR + input_file) as f: for line in f: # doc = create_document(line, input_file) # call the create_document function o = json.loads(line) doc = Document() # create a new document doc.add(TextField("filename", input_file, Field.Store.YES)) # print file doc.add( TextField("username", o['user']['screen_name'], Field.Store.YES)) # print "username: "******"text", o['text'], Field.Store.YES)) # print "text: " + o['text'] if o['user']['location']: doc.add( TextField("location", o['user']['location'], Field.Store.YES)) # print "location: " + o['user']['location'] doc.add(TextField("time", o['created_at'], Field.Store.YES)) writer.addDocument( doc) # add the document to the IndexWriter print "\nNumber of indexed documents: %d" % writer.numDocs() writer.close() print "Finished\n" print "-----------------------------------------------------"
def create_document_by_document_content(org_title, preprocessed_title, preprocessed_title_lower, content):
    """Build a Document for a whole article.

    Title variants are exact-match except the preprocessed title (tokenized);
    the content body is tokenized for full-text search.
    """
    doc = Document()
    for ctor, field_name, value in (
            (StringField, "org_title", org_title),
            (TextField, "preprocessed_title", preprocessed_title),
            (StringField, "preprocessed_title_lower", preprocessed_title_lower),
            (TextField, "content", content)):
        doc.add(ctor(field_name, value, Field.Store.YES))
    return doc
def createDocument_tweet(data):
    """Build a Document holding a tweet's text and its author's screen name."""
    tweet_text = data['text']
    author = data['user']['screen_name']
    # Coordinates are looked up but not indexed; the access is kept so a
    # missing 'coordinates' key still raises KeyError as before.
    _coords = data['coordinates']
    doc = Document()
    doc.add(TextField("tweet", tweet_text, Field.Store.YES))
    doc.add(TextField("u_name", author, Field.Store.YES))
    return doc
def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice):
    # Index a single product record. Chinese good/shop names are jieba-
    # segmented into extra "_s" fields used for search, while the raw values
    # are kept in stored-but-not-indexed fields.
    # NOTE(review): Python 2 only (print statement, unicode(),
    # `except Exception, e`); also uses the pre-5.x IntField/DoubleField API.
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    # (translated) True: build a brand-new index; False: incremental index
    writer = IndexWriter(self.dir, config)
    # Field type for raw values: stored, but neither tokenized nor indexed.
    noIndexedString = FieldType()
    noIndexedString.setTokenized(False)
    noIndexedString.setIndexed(False)
    noIndexedString.setStored(True)
    try:
        print "adding", goodname
        goodname_s = unicode(goodname, 'utf8')
        seg_list_good = jieba.cut(goodname_s, cut_all=False)
        goodname_s = " ".join(seg_list_good)  # (translated) default accurate mode
        shopname_s = unicode(shopname, 'utf8')
        seg_list_shop = jieba.cut(shopname_s, cut_all=False)
        shopname_s = " ".join(seg_list_shop)  # (translated) default accurate mode
        # Raw (display) vs segmented (search) variants of the same values.
        shopnameField = Field("shopName", shopname, noIndexedString)
        shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO)
        goodnameField = Field("goodName", goodname, noIndexedString)
        goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO)
        salenumField = IntField("saleNum", salenum, Field.Store.YES)
        priceField = DoubleField("price", price, Field.Store.YES)
        urlField = Field("url", url, noIndexedString)
        pictureField = StringField("pictureName", picturename, Field.Store.YES)
        commentField = Field("comments", comment, noIndexedString)
        historyPriceField = Field("historyPrice", historyprice, noIndexedString)
        doc = Document()
        doc.add(shopnameField)
        doc.add(shopnameField_s)
        doc.add(goodnameField)
        doc.add(goodnameField_s)
        doc.add(salenumField)
        doc.add(priceField)
        doc.add(urlField)
        doc.add(pictureField)
        doc.add(commentField)
        doc.add(historyPriceField)
        writer.addDocument(doc)
        # NOTE(review): writer is never committed or closed in this method —
        # confirm the caller handles that, otherwise the add may be lost.
    except Exception, e:
        print "Failed in indexDocs:", e
def obj_to_document(obj):
    """Convert a wrapped data object into a Lucene Document.

    Stores obj.index and the class name, then serializes each attribute of
    obj.data together with a type tag (LT_*) written to a parallel
    "<name><LTPF_TYPE>" field, presumably so the document can be converted
    back to an object later — TODO confirm against the reverse routine.
    String-like values also get a jieba-segmented "<name><LTPF_FOR_QUERY>"
    field used for querying.
    NOTE(review): Python 2 only (`unicode`, `long`); relies on the
    module-level LT_* / LTPF_* constants and the `hyper_text` type.
    """
    def conv_to_str(x):
        # unicode -> utf8 bytes; everything else through str()
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            # Placeholder: stored empty string, not indexed.
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                # Int list: unique values space-joined into searchable text.
                res.add(
                    TextField(k, ' '.join((str(x) for x in set(v))),
                              Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, str) or isinstance(v, unicode):
            # Raw value stored unindexed; segmented copy used for querying.
            res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v)), Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            # Store raw markup; query against the plain-text rendering.
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v.text)),
                          Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            # bool must be tested before int (bool is an int subclass).
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        # Record the detected type tag alongside the value.
        res.add(
            Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res
def IndexDocs(self, documents):
    """
    Index documents under the directory

    :Parameters:
    - `documents`: Documents to be indexed (List)
    """
    writer = IndexWriter(self.__indexDir, IndexWriterConfig(self.__analyzer))
    for entry in documents:
        # Build the Lucene document for this entry.
        doc = Document()
        doc.add(TextField(Indexer.NAME, entry['name'], Field.Store.YES))
        doc.add(Field(Indexer.CONTENT, entry['content'], self.__contentType))
        doc.add(StringField(Indexer.DATE, entry['date'], Field.Store.YES))
        doc.add(StringField(Indexer.URL, entry['url'], Field.Store.YES))
        doc.add(TextField(Indexer.TAGS, self.__qualifyTags(entry['tags']),
                          Field.Store.YES))
        doc.add(LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(entry['date'])))
        if self.__boAppend:
            # Existing index: replace any previously indexed copy that
            # matches this exact name.
            if self.__verbose:
                print("Updating " + entry['name'])
            writer.updateDocument(Term(Indexer.NAME, entry['name']), doc)
        else:
            # Fresh index: no old copy can exist, a plain add suffices.
            if self.__verbose:
                print("Adding " + entry['name'])
            writer.addDocument(doc)
    # Report and release the writer.
    print("Indexed %d documents (%d docs in index)" % (len(documents), writer.numDocs()))
    writer.close()
def addDoc(w, data):
    """Build a Document from ``data`` ({field: (value, type_name)}) and add
    it to the index through writer ``w``. Unknown type names are reported
    and the field is skipped."""
    doc = Document()
    for field_name in data:
        field_value, field_kind = data[field_name]
        if field_kind == 'StringField':
            new_field = StringField(field_name, field_value, Field.Store.YES)
        elif field_kind == 'TextField':
            new_field = TextField(field_name, field_value, Field.Store.YES)
        elif field_kind == 'CUSTOM_FIELD_TEXT':
            new_field = Field(field_name, field_value, CUSTOM_FIELD_TEXT)
        elif field_kind == 'CUSTOM_FIELD_TEXT_DF':
            new_field = Field(field_name, field_value, CUSTOM_FIELD_TEXT_DF)
        elif field_kind == 'CUSTOM_FIELD_TEXT_BF':
            new_field = Field(field_name, field_value, CUSTOM_FIELD_TEXT_BF)
        elif field_kind == 'INTEGER_STORED':
            new_field = StoredField(field_name, field_value)
        else:
            print('UNKNOWN FIELD')
            continue
        doc.add(new_field)
    w.addDocument(doc)
def indexDocs(self, root, writer):
    """Index every image reference found in the crawled page files under ``root``.

    File format: first line = URL, second line = title, remainder = contents
    holding img_url / img_info attribute lists (parsed by getTxtAttribute).
    One Lucene document is added per image with a non-empty description.
    Fixes: file handle closed via ``with`` (previously leaked), removed the
    no-op ``title = title`` assignment and the dangling ``else: continue``.
    """
    for top, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding", filename)
            file_path = os.path.join(top, filename)
            with open(file_path, encoding='utf8') as file:
                url = file.readline()
                title = file.readline()
                contents = file.read()
            img_url = self.getTxtAttribute(contents, 'img_url')
            img_info = self.getTxtAttribute(contents, 'img_info')
            for i in range(len(img_url)):
                if len(img_info[i]) > 0:
                    doc = Document()
                    doc.add(StringField('title', title, Field.Store.YES))
                    doc.add(StringField('url', url, Field.Store.YES))
                    doc.add(StringField('img_url', img_url[i], Field.Store.YES))
                    # jieba-segment the image description so Chinese text
                    # becomes searchable.
                    segmented = ' '.join(jieba.lcut_for_search(img_info[i]))
                    doc.add(TextField('contents', segmented, Field.Store.YES))
                    writer.addDocument(doc)
def addDoc(w, data):
    """Build a Document from ``data`` ({field: (value, type_name)}) and add
    it via writer ``w``.

    On indexing failure, every field is dumped for debugging instead of
    crashing. Fix: the bare ``except:`` is narrowed to ``except Exception:``
    so SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    doc = Document()
    for field in data:
        value, ftype = data[field][0], data[field][1]
        if ftype == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif ftype == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif ftype == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif ftype == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif ftype == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    try:
        w.addDocument(doc)
    except Exception:
        # Diagnostic dump of the offending record.
        print('-----------------------------------')
        for field in data:
            value, ftype = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
def indexDocs(self, root, writer):
    """Index the TEXT elements of every XML file under <root>/data/*/*.xml.

    Each TEXT element becomes one document keyed by its sibling DOCNO.
    Returns the number of Lucene documents added.
    """
    num_indexed = 0
    for xml in glob.glob(root + "/data/*/*.xml"):
        # Try strict parsing first; on syntax errors re-parse in recovery mode.
        try:
            tree = etree.parse(
                xml, parser=etree.XMLParser(recover=False, strip_cdata=False))
        except etree.XMLSyntaxError:
            tree = etree.parse(
                xml, parser=etree.XMLParser(recover=True, strip_cdata=False))
        for text in tree.getroot().iter("TEXT"):
            contents = "".join(text.xpath("text()")).strip()
            doc_no = text.getparent().find("DOCNO").text
            try:
                doc = Document()
                doc.add(StringField("id", doc_no, Field.Store.YES))
                if contents:
                    doc.add(TextField("contents", contents, Field.Store.YES))
                writer.addDocument(doc)
                num_indexed += 1
            except Exception as e:
                print("Failed in indexDocs:", e)
    return num_indexed
def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    """Add one person record: tokenized name plus exact-match date/note fields."""
    doc = Document()
    doc.add(TextField("name", name, Field.Store.YES))
    for field_name, value in (("birth_date", birth_date),
                              ("death_date", death_date),
                              ("birth_note", birth_note),
                              ("death_note", death_note)):
        doc.add(StringField(field_name, value, Field.Store.YES))
    w.addDocument(doc)
def create_document(file_name):
    """Build a Document for ./alldocs/<file_name>: exact-match title plus
    tokenized full text.

    Fix: the title field previously used the undefined global ``input_file``
    (a NameError unless it happened to exist elsewhere); it must be the
    ``file_name`` parameter. The file is also closed via ``with`` now.
    """
    path = './alldocs/' + file_name
    doc = Document()
    with open(path) as file:
        doc.add(StringField("title", file_name, Field.Store.YES))
        doc.add(TextField("text", file.read(), Field.Store.YES))
    return doc
def create_doc(data):
    """Build a tweet Document; location and page title are added only when present."""
    doc = Document()
    doc.add(TextField("username", data['screen_name'], Field.Store.YES))
    doc.add(TextField("text", data['tweet'], Field.Store.YES))
    doc.add(TextField("date", data['tweet_date'], Field.Store.YES))
    # Optional fields: skipped when empty/None.
    if data['tweet_location']:
        doc.add(TextField("location", data['tweet_location'], Field.Store.YES))
    if data['page_title']:
        doc.add(TextField("page title", data['page_title'], Field.Store.YES))
    return doc
def index_document(self, wiki_doc):
    """
    :param wiki_doc: the document to be indexed.
    :return:
    """
    # One Lucene document per section; the section id is
    # "<article_id>_<section_index>".
    for position, section in enumerate(wiki_doc.sections):
        doc = Document()
        doc.add(StringField("id_article", wiki_doc.id, Field.Store.YES))
        doc.add(TextField("title_article", wiki_doc.title, Field.Store.YES))
        doc.add(StringField("id_section",
                            str(wiki_doc.id) + "_" + str(position),
                            Field.Store.YES))
        doc.add(TextField("title_section", section.title, Field.Store.YES))
        doc.add(TextField("content_section", section.text, Field.Store.YES))
        self.writer.addDocument(doc)
def addDocument(self, id):
    """Index the answer at position ``id`` of the global answers_train list
    and commit immediately so it is searchable right away."""
    global answers_train
    doc = Document()
    doc.add(TextField("pa", answers_train[id], Field.Store.YES))
    doc.add(StringField("id", str(id), Field.Store.YES))
    self.w.addDocument(doc)
    self.w.commit()
def addDoc(w, doc_name, text, file_name):
    """
    add single doc to the index
    :param w: writer
    :param doc_name:
    :param text:
    :param file_name:
    :return:
    """
    # All three fields are stored, tokenized TextFields.
    doc = Document()
    for field_name, value in (("text", text),
                              ("doc_name", doc_name),
                              ("corpus_name", file_name)):
        doc.add(TextField(field_name, value, Field.Store.YES))
    w.addDocument(doc)
def create_document(file_name):
    """Build a Document for INPUT_DIR/<file_name>: exact-match title plus
    the whole book as tokenized text.

    Fix: the title field previously used the undefined global ``input_file``
    (a NameError unless it happened to exist elsewhere); it must be the
    ``file_name`` parameter. The file is also closed via ``with`` now.
    """
    path = INPUT_DIR + file_name  # assemble the file descriptor
    doc = Document()  # create a new document
    with open(path) as file:
        doc.add(StringField("title", file_name, Field.Store.YES))
        doc.add(TextField("text", file.read(), Field.Store.YES))
    return doc
def get_doc(self, doc_info, contents):
    '''
    Generate a `Document` according to the given info.
    Input: `doc_info`: info of the doc (`name`, `path`, `title`, `url`, `site`)
           `contents`: contents of the webpage
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    # Exact-match metadata fields, in the same order as the source dict keys.
    for key in ('name', 'path', 'title', 'url'):
        doc.add(StringField(key, doc_info[key], Field.Store.YES))
    # Tokenized fields for searching.
    doc.add(TextField("site", doc_info['site'], Field.Store.YES))
    if contents:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(doc_info['name']))
    return doc
def retrival_answer(MAX):
    """Index document.txt lines, run each query.txt query against them, and
    report the fraction of queries whose top-MAX hits fuzzy-match the
    ground-truth document line.

    MAX: number of top hits retrieved per query.
    NOTE(review): Python 2 only (print statements, `unicode`); `process`
    is presumably fuzzywuzzy's extract — confirm the import.
    """
    lucene.initVM()
    directory = RAMDirectory()
    # NOTE(review): indexDir is created but never used afterwards — the index
    # is built in the in-memory `directory`; confirm whether an on-disk
    # index at 'index' was intended.
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."
    # NOTE(review): these file handles are never closed — consider `with`.
    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    # One Lucene document per line of document.txt.
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))
        # Fuzzy-match the retrieved candidates against the ground-truth line;
        # a similarity score >= 89 (scale 0-100) counts as a hit.
        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)
    final_accuracy = float(sum(accuracy)) / float(len(accuracy))
    print "the final accuracy is:", final_accuracy
def get_doc(self, img):
    '''
    Generate a `Document` according to the parameters.
    Input: `img`: dict containing a single image info
    Output: `Document` with the fields initialized
    '''
    document = Document()
    # URL fields are exact-match; only the description is tokenized.
    document.add(StringField("img_url", img['img_url'], Field.Store.YES))
    document.add(TextField("description", img['description'], Field.Store.YES))
    document.add(StringField("url", img['url'], Field.Store.YES))
    document.add(StringField("url_title", img['url_title'], Field.Store.YES))
    return document
def addDoc(w, text):
    """Index ``text`` as a single stored, tokenized "text" field via writer ``w``."""
    document = Document()
    document.add(TextField("text", text, Field.Store.YES))
    w.addDocument(document)