Esempi in Python per TextField.TextField, esempi in Python per org.apache.lucene.document.TextField.TextField

Esempio n. 1

0

Mostra file

 def index_single_file(self, doc_file):
     logger.info("adding {}".format(doc_file))
     lucene_doc_num = 0
     try:
         with open(doc_file) as df:
             for line in df:
                 wiki_doc = json.loads(line)
                 doc_title = wiki_doc['title']
                 doc_text = wiki_doc['plaintext']
                 doc_id = wiki_doc['_id']
                 paragraphs = doc_text.split('\n\n')
                 if len(paragraphs) < 3:
                     continue
                 doc_text = rm_special_chars(doc_text)
                 doc = Document()
                 doc.add(StringField("id", str(doc_id), Field.Store.YES))
                 doc.add(TextField("title", doc_title, Field.Store.YES))
                 doc.add(TextField("text", doc_text, Field.Store.YES))
                 self.writer.addDocument(doc)
                 lucene_doc_num += 1
                 if lucene_doc_num % 10000 == 0:
                     logger.info('added {} lucene docs'.format(lucene_doc_num))
     except Exception as e:
         import traceback
         traceback.print_tb(e.__traceback__)
         logger.error("Failed in: {}".format(doc_file))
     return lucene_doc_num

Esempio n. 2

0

Mostra file

 def get_document(self, title, body, tags, date):
     doc = Document()
     doc.add(TextField(self.INDEX_FILED_TITLE, title, Field.Store.YES))
     doc.add(TextField(self.INDEX_FILED_BODY, body, Field.Store.YES))
     doc.add(TextField(self.INDEX_FILED_TAGS, tags, Field.Store.YES))
     doc.add(TextField(self.INDEX_FILED_DATE, date, Field.Store.YES))
     return doc

Esempio n. 3

0

Mostra file

File: IndexFiles.py Progetto: elfdown/ee208

    def indexDocs(self, root, writer):   
        for root,dirnames,filenames in os.walk(root):
            for dirname in dirnames: #遍历文件夹
                path1 = os.path.join(root,dirname)
                for trivial1 , trivial2 , filenames in os.walk(path1): #遍历文件夹下的文件
                    for filename in filenames:
                        #print(root,dirnames,filename)
                        print("adding", filename)
                        # try:
                        path = os.path.join(path1, filename)
                        file = open(path, encoding='utf8')
                        page = file.readline()
                        title = file.readline()
                        contents = file.read()
                        file.close()

                        # jieba 分词
                        seg_contents = jieba.lcut_for_search(contents)
                        contents = ' '.join(seg_contents)
                        url = page
                        seg_url = jieba.lcut_for_search(page)
                        page = ' '.join(list(set(seg_url)-set(['.','http','https','/',':','?','=','html','shtml','www'])))

                        doc = Document()
                        doc.add(StringField("name", filename, Field.Store.YES))
                        doc.add(StringField("path", path, Field.Store.YES))
                        if len(contents) > 0:
                            doc.add(TextField('title', title, Field.Store.YES))
                            doc.add(TextField('site', page, Field.Store.YES))
                            doc.add(TextField('url',url,Field.Store.YES))
                            doc.add(TextField('contents', contents, Field.Store.YES))
                        else:
                            print("warning: no content in %s" % filename)
                        writer.addDocument(doc)

Esempio n. 4

0

Mostra file

File: index.py Progetto: zjcom/hoaxy-backend

 def index_one(self, article):
     """Create index for one url object in the database.
     """
     try:
         date_published_str = article['date_published'].strftime(
             self.date_format)
     except Exception as e:
         logger.warning('Error when formating date_published %r: %s ',
                        article['canonical_url'], e)
         return
     doc = Document()
     doc.add(StoredField('group_id', article['group_id']))
     doc.add(StoredField('article_id', article['article_id']))
     doc.add(
         StringField('date_published', date_published_str, Field.Store.YES))
     doc.add(
         SortedDocValuesField('date_published',
                              BytesRef(date_published_str)))
     doc.add(StoredField('date_published', date_published_str))
     doc.add(StringField('domain', article['domain'], Field.Store.YES))
     doc.add(StringField('site_type', article['site_type'],
                         Field.Store.YES))
     doc.add(
         TextField('canonical_url', article['canonical_url'],
                   Field.Store.YES))
     doc.add(TextField('title', article['title'], Field.Store.YES))
     doc.add(TextField('meta', article['meta'], Field.Store.NO))
     doc.add(TextField('content', article['content'], Field.Store.NO))
     doc.add(StoredField('uq_id_str', article['uq_id_str']))
     self.writer.addDocument(doc)

Esempio n. 5

0

Mostra file

	def getDoc(self, file):
		try:
			f = open(os.getcwd()+FILE_DIR+'/'+file, "r")

			try:
				c = []
				s = BeautifulSoup(f, 'html.parser')
				text = s.findAll(text=True)
				c = filter(tag_vis, text)
				try:
					c = ' '.join(c)
				except Exception as e:
					c = b' '.join(c)
			except Exception as e:
				print(str(e))
				return
			content = TextField("contents", c, Field.Store.YES)
			fileName = str(Paths.get(file)).split('/')[-1]
			fileName = fileName[:fileName.find(".")]
			filename = TextField("filename",
							 fileName,
							 Field.Store.YES)
			path = TextField("filepath",
						 str(os.getcwd()+FILE_DIR+'/'+file),
						 Field.Store.NO)
			doc = Document()
			doc.add(content)
			doc.add(filename)
			doc.add(path)
			return doc
		except Exception as e:
			print(type(Exception).__name__)
			print(str(e))
			return

Esempio n. 6

0

Mostra file

    def get_doc(self, filename, path, title, url, contents):
        '''
        Generate a `Document` according to the parameters.

        Input: `filename`: filename of the webpage
               `path`: path of the webpage
               `title`: title of the webpage
               `url`: original url of the webpage
               `contents`: contents of the webpage
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        # doc.add(Field("name", filename, self.property_type))
        # doc.add(Field("path", path, self.property_type))
        # doc.add(Field("title", title, self.property_type))
        # doc.add(Field("url", url, self.property_type))
        doc.add(StringField("name", filename, Field.Store.YES))
        doc.add(StringField("path", path, Field.Store.YES))
        doc.add(TextField("title", title, Field.Store.YES))
        doc.add(TextField("url", url, Field.Store.YES))
        if len(contents) > 0:
            # doc.add(Field("contents", contents, self.content_type))
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print("Warning: No content in {}".format(filename))
        return doc

Esempio n. 7

0

Mostra file

File: FacetExample.py Progetto: jessekafor/bisonlucene

    def index (cls, indexDir, taxoDir):
        """Create an index, and adds to it sample documents and facets.
        indexDir Directory in which the index should be created.
        taxoDir Directory in which the taxonomy index should be created.
        """
        # create and open an index writer
        from org.apache.lucene.util import Version
        config = IndexWriterConfig(Version.LUCENE_42,
                                   WhitespaceAnalyzer(Version.LUCENE_42))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        iw = IndexWriter(indexDir, config)
        # create and open a taxonomy writer
        taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
        # FacetFields is a utility class for adding facet fields to a document:
        facet_fields = FacetFields(taxo)

        # loop over sample documents
        nDocsAdded = 0
        nFacetsAdded = 0
        for docNum in range(len(docTexts)):
            # obtain the sample facets for current document
            facets = categories[docNum]
            facetList = [CategoryPath(f) for f in facets]
            # NOTE: setCategoryPaths() requires an Iterable, so need to convert the
            #       Python list in order to to pass a proper argument to setCategoryPaths.
            #       We use java.util.Arrays (via JCC) to create a Java List:
            facetList = Arrays.asList(facetList)

            # NOTE: we could use lucene.collections here as well in order to convert our
            # Python list to a Java based list using the JavaList class (JavaList implements
            # java.util.List around a Python list instance it wraps):
            #  from lucene.collections import JavaList
            #  facetList = JavaList(facetList)

            # create a plain Lucene document and add some regular Lucene fields to it
            doc = Document()
            doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
            doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
            # use the FacetFields utility class for adding facet fields (i.e. the categories)
            # to the document (and, as required, to the taxonomy index)
            facet_fields.addFields(doc, facetList)
            # finally add the document to the index
            iw.addDocument(doc)
            nDocsAdded +=1
            nFacetsAdded += facetList.size()
        # end for

        # commit changes.
        # we commit changes to the taxonomy index prior to committing them to the search index.
        # this is important, so that all facets referred to by documents in the search index
        # will indeed exist in the taxonomy index.
        taxo.commit()
        iw.commit()

        # close the taxonomy index and the index - all modifications are
        # now safely in the provided directories: indexDir and taxoDir.
        taxo.close()
        iw.close()
        print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)

Esempio n. 8

0

Mostra file

File: linking.py Progetto: zzsfornlp/opera-TA1-linking

 def index(self, eid, name, cname, type, info):
     doc = Document()
     doc.add(TextField('id', eid, Field.Store.YES))
     doc.add(TextField('name', name, Field.Store.YES))
     doc.add(TextField('CannonicalName', cname, Field.Store.YES))
     doc.add(TextField('type', type, Field.Store.YES))
     doc.add(TextField('info', info, Field.Store.YES))
     self.writer.addDocument(doc)

Esempio n. 9

0

Mostra file

File: pylucene-title-content-based.py Progetto: youshaox/automatic_fact_verification_8395

def create_document_by_document_sentence(org_title, preprocessed_title, doc_id,
                                         sentence):
    doc = Document()  # create a new document
    doc.add(StringField("org_title", org_title, Field.Store.YES))
    doc.add(
        TextField("preprocessed_title", preprocessed_title, Field.Store.YES))
    doc.add(StringField("doc_id", str(doc_id), Field.Store.YES))
    # doc.add(StringField("content", content, Field.Store.YES))
    doc.add(TextField("sentence", sentence, Field.Store.YES))
    return doc

Esempio n. 10

0

Mostra file

File: indexer.py Progetto: Dnguy104/Twitter-Search

def index_files():
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    BASE_DIR = path.dirname(path.abspath(sys.argv[0]))
    INPUT_DIR = BASE_DIR + "/input/"
    INDEX_DIR = BASE_DIR + "/lucene_index/"

    NoT = 100000  # Number of Tokens
    print "------------------------------------------------------"
    print "PyLucene Demo started (lucene_demo.py)"
    print "Python version: %d.%d.%d" % (
        sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
    print 'Lucene version:', lucene.VERSION
    print "------------------------------------------------------\n"
    # lucene.initVM()

    # directory = RAMDirectory()
    index_path = Paths.get(INDEX_DIR)
    directory = SimpleFSDirectory(index_path)

    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    print "Number of indexed documents: %d\n" % writer.numDocs()
    for input_file in listdir(INPUT_DIR):  # iterate over all input files
        print "Current file:", input_file
        if input_file.endswith(".json"):
            with open(INPUT_DIR + input_file) as f:
                for line in f:
                    # doc = create_document(line, input_file) # call the create_document function
                    o = json.loads(line)
                    doc = Document()  # create a new document
                    doc.add(TextField("filename", input_file, Field.Store.YES))
                    # print file
                    doc.add(
                        TextField("username", o['user']['screen_name'],
                                  Field.Store.YES))
                    # print "username: "******"text", o['text'], Field.Store.YES))
                    # print "text: " + o['text']
                    if o['user']['location']:
                        doc.add(
                            TextField("location", o['user']['location'],
                                      Field.Store.YES))
                        # print "location: " + o['user']['location']
                    doc.add(TextField("time", o['created_at'],
                                      Field.Store.YES))
                    writer.addDocument(
                        doc)  # add the document to the IndexWriter
    print "\nNumber of indexed documents: %d" % writer.numDocs()
    writer.close()
    print "Finished\n"
    print "-----------------------------------------------------"

Esempio n. 11

0

Mostra file

File: pylucene-title-content-based.py Progetto: youshaox/automatic_fact_verification_8395

def create_document_by_document_content(org_title, preprocessed_title,
                                        preprocessed_title_lower, content):
    doc = Document()  # create a new document
    doc.add(StringField("org_title", org_title, Field.Store.YES))
    doc.add(
        TextField("preprocessed_title", preprocessed_title, Field.Store.YES))
    doc.add(
        StringField("preprocessed_title_lower", preprocessed_title_lower,
                    Field.Store.YES))
    # doc.add(StringField("content", content, Field.Store.YES))
    doc.add(TextField("content", content, Field.Store.YES))
    return doc

Esempio n. 12

0

Mostra file

def createDocument_tweet(data):
    jsonText = data['text']  #accesses tweet
    jsonName = data['user']['screen_name']  #accesses username
    jsonLocation = data['coordinates']

    doc = Document()
    #added fields
    doc.add(TextField("tweet", jsonText, Field.Store.YES))
    doc.add(TextField("u_name", jsonName, Field.Store.YES))
    # doc.add(TextField("date", jsonDate, Field.Store.YES))

    # print jsonText
    return doc

Esempio n. 13

0

Mostra file

File: UpdateIndex.py Progetto: paoxiaode/EE208-2019

    def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice):
        analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(self.dir, config)
        # True，建立新索引，False，建立增量索引

        noIndexedString = FieldType()
        noIndexedString.setTokenized(False)
        noIndexedString.setIndexed(False)
        noIndexedString.setStored(True)

        try:
            print "adding", goodname

            goodname_s = unicode(goodname, 'utf8')
            seg_list_good = jieba.cut(goodname_s, cut_all=False)
            goodname_s = " ".join(seg_list_good)  # 默认模式

            shopname_s = unicode(shopname, 'utf8')
            seg_list_shop = jieba.cut(shopname_s, cut_all=False)
            shopname_s = " ".join(seg_list_shop)  # 默认模式

            shopnameField = Field("shopName", shopname, noIndexedString)
            shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO)
            goodnameField = Field("goodName", goodname, noIndexedString)
            goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO)
            salenumField = IntField("saleNum", salenum, Field.Store.YES)
            priceField = DoubleField("price", price, Field.Store.YES)
            urlField = Field("url", url, noIndexedString)
            pictureField = StringField("pictureName", picturename, Field.Store.YES)
            commentField = Field("comments", comment, noIndexedString)
            historyPriceField = Field("historyPrice", historyprice, noIndexedString)

            doc = Document()
            doc.add(shopnameField)
            doc.add(shopnameField_s)
            doc.add(goodnameField)
            doc.add(goodnameField_s)
            doc.add(salenumField)
            doc.add(priceField)
            doc.add(urlField)
            doc.add(pictureField)
            doc.add(commentField)
            doc.add(historyPriceField)

            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e

Esempio n. 14

0

Mostra file

File: zhihu_page_analyzer.py Progetto: 717297azbcgh/zhihu_stats

def obj_to_document(obj):
    def conv_to_str(x):
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                res.add(
                    TextField(k, ' '.join((str(x) for x in set(v))),
                              Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, str) or isinstance(v, unicode):
            res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v)), Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v.text)),
                          Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        res.add(
            Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res

Esempio n. 15

0

Mostra file

    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that would we added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

Esempio n. 16

0

Mostra file

File: build_para_uri_index.py Progetto: linxinshi/CUIS_TREC2018_CAR

def addDoc(w, data):
    doc = Document()
    #print ('----------------------------')
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        print ('field:%s  type:%s'%(field,type))
        print (value+'\n')
        '''
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif type == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    w.addDocument(doc)

Esempio n. 17

0

Mostra file

File: IndexFiles_photo.py Progetto: elfdown/ee208

    def indexDocs(self, root, writer):
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                print("adding", filename)
                path = os.path.join(root, filename)
                file = open(path, encoding='utf8')
                url = file.readline()
                title = file.readline()
                contents = file.read()
                file.close()
                img_url = self.getTxtAttribute(contents, 'img_url')
                img_info = self.getTxtAttribute(contents, 'img_info')
                for i in range(len(img_url)):
                    if len(img_info[i]) > 0:
                        title = title
                        doc = Document()

                        doc.add(StringField('title', title, Field.Store.YES))
                        doc.add(StringField('url', url, Field.Store.YES))
                        doc.add(
                            StringField('img_url', img_url[i],
                                        Field.Store.YES))
                        seg_contents = jieba.lcut_for_search(img_info[i])
                        contents = ' '.join(seg_contents)
                        doc.add(
                            TextField('contents', contents, Field.Store.YES))
                        writer.addDocument(doc)
                    else:
                        continue

Esempio n. 18

0

Mostra file

File: create_category_corpus.py Progetto: linxinshi/EntityRetrievalPAS

def addDoc(w, data):
    doc = Document()
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        if type!='INTEGER_STORED':
           #print ('field=%s  len=%d'%(field,len(value)))
           print ('field=%s  value=%s'%(field,value))
        else:
           print ('field=%s  value=%d'%(field,value))
        '''

        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except:
        #print ('error cat=%s'%(data['category'][0]))
        print('-----------------------------------')
        for field in data:
            value, type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))

Esempio n. 19

0

Mostra file

    def indexDocs(self, root, writer):
        path = root + "/data/*/*.xml"
        # print(path)
        xml_files = glob.glob(path)
        # xml_files = ["HAM2-031201.xml"]
        numDocs = 0
        for xml in xml_files:
            try:
                parser = etree.XMLParser(recover=False, strip_cdata=False)
                tree = etree.parse(xml, parser=parser)

            except etree.XMLSyntaxError as e:
                parser = etree.XMLParser(recover=True, strip_cdata=False)
                tree = etree.parse(xml, parser=parser)

            root = tree.getroot()
            for text in root.iter("TEXT"):
                contents = "".join(text.xpath("text()")).strip()
                doc_no = text.getparent().find("DOCNO").text
                # print("adding", doc_no)
                try:
                    doc = Document()
                    doc.add(StringField("id", doc_no, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(
                            TextField("contents", contents, Field.Store.YES))
                    else:
                        pass
                        # print("warning: no content in %s" % doc_no)
                    writer.addDocument(doc)
                    numDocs += 1
                except Exception as e:
                    print("Failed in indexDocs:", e)
        return numDocs

Esempio n. 20

0

Mostra file

File: search_pylucene.py Progetto: skluckova/VINF

def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    doc = Document()
    doc.add(TextField("name", name, Field.Store.YES))
    doc.add(StringField("birth_date", birth_date, Field.Store.YES))
    doc.add(StringField("death_date", death_date, Field.Store.YES))
    doc.add(StringField("birth_note", birth_note, Field.Store.YES))
    doc.add(StringField("death_note", death_note, Field.Store.YES))
    w.addDocument(doc)

Esempio n. 21

0

Mostra file

File: pylucene.py Progetto: TharunMohandoss/All_Assignments

def create_document(file_name):
    path = './alldocs/' + file_name
    file = open(path)
    doc = Document()
    doc.add(StringField("title", input_file, Field.Store.YES))
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()
    return doc

Esempio n. 22

0

Mostra file

File: index.py Progetto: davepie101/Twitter-Crawler

def create_doc(data):
    screen_name = data['screen_name']
    tweet = data['tweet']
    tweet_date = data['tweet_date']
    tweet_location = data['tweet_location']
    page_title = data['page_title']

    doc = Document()
    doc.add(TextField("username", screen_name, Field.Store.YES))
    doc.add(TextField("text", tweet, Field.Store.YES))
    doc.add(TextField("date", tweet_date, Field.Store.YES))
    if tweet_location:
        doc.add(TextField("location", tweet_location, Field.Store.YES))
    if page_title:
        doc.add(TextField("page title", page_title, Field.Store.YES))

    return doc

Esempio n. 23

0

Mostra file

 def index_document(self, wiki_doc):
     """
     :param wiki_doc: the document to be indexed.
     :return:
     """
     # Method that indexes documents
     i = 0
     for section in wiki_doc.sections:
         doc = Document()
         doc.add(StringField("id_article", wiki_doc.id, Field.Store.YES))
         doc.add(TextField("title_article", wiki_doc.title, Field.Store.YES))
         doc.add(StringField("id_section", str(
             wiki_doc.id) + "_" + str(i), Field.Store.YES))
         doc.add(TextField("title_section", section.title, Field.Store.YES))
         doc.add(TextField("content_section", section.text, Field.Store.YES))
         self.writer.addDocument(doc)
         i += 1

Esempio n. 24

0

Mostra file

File: incremental.py Progetto: shubhampachori12110095/QA-Clustering

 def addDocument(self, id):
     global answers_train
     preA = answers_train[id]
     doc = Document()
     doc.add(TextField("pa", preA, Field.Store.YES))
     doc.add(StringField("id", str(id), Field.Store.YES))
     self.w.addDocument(doc)
     self.w.commit()

Esempio n. 25

0

Mostra file

def addDoc(w, doc_name, text, file_name):
    """
    add single doc to the index
    :param w: writer
    :param doc_name:
    :param text:
    :param file_name:
    :return:
    """
    doc = Document()
    # TextField: sequence of terms: tokenized
    doc.add(TextField("text", text, Field.Store.YES))
    # StringField: character strings with all punctuation, spacing, and case preserved.
    doc.add(TextField('doc_name', doc_name, Field.Store.YES))
    #doc.add(StringField('corpus_name', file_name, Field.Store.YES))

    doc.add(TextField('corpus_name', file_name, Field.Store.YES))
    w.addDocument(doc)

Esempio n. 26

0

Mostra file

File: lucene_demo.py Progetto: yasmineTYM/PyLucene-Demo

def create_document(file_name):
    path = INPUT_DIR + file_name  # assemble the file descriptor
    file = open(path)  # open in read mode
    doc = Document()  # create a new document
    # add the title field
    doc.add(StringField("title", input_file, Field.Store.YES))
    # add the whole book
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()  # close the file pointer
    return doc

Esempio n. 27

0

Mostra file

    def get_doc(self, doc_info, contents):
        '''
        Generate a `Document` according to the given info.

        Input: `doc_info`: info of the doc (`name`, `path`, `title`, `url`, `site`)
               `contents`: contents of the webpage
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        doc.add(StringField("name", doc_info['name'], Field.Store.YES))
        doc.add(StringField("path", doc_info['path'], Field.Store.YES))
        doc.add(StringField("title", doc_info['title'], Field.Store.YES))
        doc.add(StringField("url", doc_info['url'], Field.Store.YES))
        doc.add(TextField("site", doc_info['site'], Field.Store.YES))
        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print("Warning: No content in {}".format(doc_info['name']))
        return doc

Esempio n. 28

0

Mostra file

def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()

    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))

    print "the final accuracy is:", final_accuracy

Esempio n. 29

0

Mostra file

    def get_doc(self, img):
        '''
        Generate a `Document` according to the parameters.

        Input: `img`: dict containing a single image info
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        doc.add(StringField("img_url", img['img_url'], Field.Store.YES))
        doc.add(TextField("description", img['description'], Field.Store.YES))
        doc.add(StringField("url", img['url'], Field.Store.YES))
        doc.add(StringField("url_title", img['url_title'], Field.Store.YES))
        return doc

Esempio n. 30

0

Mostra file

def addDoc(w, text):
    """
    add single doc to the index
    :param w: writer
    :param doc_name:
    :param text:
    :param file_name:
    :return:
    """
    doc = Document()
    # TextField: sequence of terms: tokenized
    doc.add(TextField("text", text, Field.Store.YES))
    w.addDocument(doc)