Example No. 1
def indexDocuments():
    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for article in Wikipedia():
        doc = Document()
        doc.add(
            Field('id', str(article['id'][0]), Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('title', article['url'], Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('content', article['text'], Field.Store.NO,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)

    print 'Optimization'
    writer.optimize()
    writer.close()
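A minimal sketch of how such an index could be queried afterwards, assuming the same flat PyLucene 3.x module used above; the function name, index path and query string are placeholders, and lucene.initVM() must already have been called:

def searchDocuments(indexDir, text, limit=10):
    # Open the index written by indexDocuments() and parse the query with
    # the same EnglishAnalyzer that was used at indexing time.
    searcher = IndexSearcher(SimpleFSDirectory(File(indexDir)), True)
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, 'content', analyzer).parse(text)
    for hit in searcher.search(query, limit).scoreDocs:
        doc = searcher.doc(hit.doc)
        # 'id' and 'title' were stored; 'content' was indexed but not stored.
        print doc.get('id'), doc.get('title')
    searcher.close()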
Example No. 2
    def indexDocs(self, root, writer):

        f = codecs.open('picIndex.txt','r',encoding='utf-8')
        picDict = {}
        for line in f.xreadlines():
            ls = line.split('seg^*')
            url = ls[0]
            title = ls[1] 
            src = ls[2]
            alt = ls[3]
            picDict[src] = [url,title,alt]
        f.close()
        for src in picDict:
            doc = Document()
            doc.add(Field("src", src,
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("url", picDict[src][0],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("title", picDict[src][1],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("alt", picDict[src][2],
                                 Field.Store.YES,
                                 Field.Index.ANALYZED))
            writer.addDocument(doc)
Example No. 3
 def indexDocs(self, root, writer):
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             if not filename.endswith('.txt'):
                 continue
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 for line in file:
                     doc = Document()
                     arr = line.split('\t')
                     field = Field("name", arr[2].lower(),
                                          Field.Store.YES,
                                          Field.Index.TOKENIZED)
                     field.setBoost(1.5)
                     doc.add(field)
                     doc.add(Field("alternate_names", arr[3].lower(),
                                          Field.Store.YES,
                                          Field.Index.TOKENIZED))
                     doc.add(Field("state", arr[10].lower(),
                                          Field.Store.YES,
                                          Field.Index.TOKENIZED))
                     doc.add(Field("population", arr[14],
                                          Field.Store.YES,
                                          Field.Index.UN_TOKENIZED))
                     if int(arr[14]) > 1000000:
                         doc.setBoost(1.2)
                     writer.addDocument(doc)
                 file.close()
             except Exception, e:
                 print "Failed in indexDocs:", e
Example No. 4
    def addDocuments(self, dir):

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        #
        # change to adjust performance of indexing with FSDirectory
        # writer.mergeFactor = writer.mergeFactor
        # writer.maxMergeDocs = writer.maxMergeDocs
        # writer.minMergeDocs = writer.minMergeDocs
        #

        for word in self.docs:
            doc = Document()
            doc.add(
                Field("keyword", word, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(Field("unindexed", word, Field.Store.YES, Field.Index.NO))
            doc.add(
                Field("unstored", word, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("text", word, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example No. 5
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # Elwood
        document = Document()
        document.add(
            Field("owner", "elwood", Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "elwoods sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        # Jake
        document = Document()
        document.add(
            Field("owner", "jake", Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(
            Field("keywords", "jakes sensitive info", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(document)

        writer.close()
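A NOT_ANALYZED owner field like this one is typically used to restrict searches to a single user's documents. A rough sketch of a test method against this setUp, assuming Lucene 3.x's QueryWrapperFilter is available and using a hypothetical method name:

    def testFilterByOwner(self):
        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("keywords", "info"))
        # Keep only documents whose owner field is exactly "jake".
        jakeFilter = QueryWrapperFilter(TermQuery(Term("owner", "jake")))
        scoreDocs = searcher.search(query, jakeFilter, 10).scoreDocs
        self.assertEqual(1, len(scoreDocs))
        self.assertEqual("jake", searcher.doc(scoreDocs[0].doc).get("owner"))
        searcher.close()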
Example No. 6
    def testUpdate(self):

        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

        reader = IndexReader.open(self.dir, False)
        reader.deleteDocuments(Term("city", "Amsterdam"))
        reader.close()

        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
        doc.add(
            Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
        doc.add(
            Field("city", "St. Petersburg", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()

        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
Example No. 7
    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))
Example No. 8
 def addCrowd(self, id, text):
     doc = Document()
     doc.add(
         Field(CrowdFields.id, id, Field.Store.YES,
               Field.Index.NOT_ANALYZED))
     doc.add(
         Field(CrowdFields.text, text, Field.Store.YES,
               Field.Index.ANALYZED))
     self.writer.updateDocument(Term(CrowdFields.id, id), doc)
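updateDocument() first deletes any document matching the given id term and then adds the new one, so repeated calls with the same id behave like an upsert. A self-contained sketch of the same pattern, assuming lucene.initVM() has been called and using plain field names in place of CrowdFields:

directory = RAMDirectory()
writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                     IndexWriter.MaxFieldLength.UNLIMITED)

for text in ("first draft of the text", "revised text"):
    doc = Document()
    doc.add(Field("id", "42", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
    # Replaces any previously indexed document carrying the same id term.
    writer.updateDocument(Term("id", "42"), doc)

writer.close()
# IndexReader.open(directory, True).numDocs() now reports 1.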
Example No. 9
 def addDocuments(self, _id, title, content):
     doc = Document()
     doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
     if title is not None and len(title) > 0:
         doc.add(
             Field("titleKeyword", title, Field.Store.NO,
                   Field.Index.ANALYZED))
     if content is not None and len(content) > 0:
         doc.add(
             Field("contentKeyword", content, Field.Store.NO,
                   Field.Index.ANALYZED))
     self.index_writer.addDocument(doc)
Example No. 10
    def addPoint(self, writer, name, type, x, y):

        doc = Document()
        doc.add(Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(
            Field("x", str(x), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))
        doc.add(
            Field("y", str(y), Field.Store.YES,
                  Field.Index.NOT_ANALYZED_NO_NORMS))

        writer.addDocument(doc)
Example No. 11
def _IndexStringField(doc, field_name, field_content):
    #print "This is StringField:",field_content
    if field_content is None:
        return
    else:
        if ("id" == field_name):
            doc.add(
                Field(field_name, str(field_content), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        else:
            doc.add(
                Field(field_name, str(field_content), Field.Store.YES,
                      Field.Index.ANALYZED))
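A small usage sketch for the helper above; the record dict, the field names other than "id", and the already-open IndexWriter called writer are assumptions:

doc = Document()
record = {"id": 1138, "title": "An example record", "body": "Some free text"}
for field_name, field_content in record.items():
    # "id" is stored verbatim; every other field goes through the analyzer.
    _IndexStringField(doc, field_name, field_content)
writer.addDocument(doc)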
Example No. 12
def indexDoc(writer, d):
    doc = Document()
    name_ = Field("name_", d.name,
                  Field.Store.YES, Field.Index.TOKENIZED)
    name_.setBoost(2.0)
    full_text = Field("full_text", d.full,
                  Field.Store.YES, Field.Index.TOKENIZED)
    id = Field("id", str(d.id),
                  Field.Store.YES, Field.Index.UN_TOKENIZED)
    doc.add(name_)
    doc.add(full_text)
    doc.add(id)

    writer.addDocument(doc)
Example No. 13
    def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):

        STORED = Field.Store.YES
        UN_INDEXED = Field.Index.NO
        UN_TOKENIZED = Field.Index.UN_TOKENIZED

        doc = Document()
        doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
        doc.add(Field("version", str(version), STORED, UN_INDEXED))
        reader = StringReader(reader.read())
        doc.add(Field("contents", reader, Field.TermVector.YES))

        indexWriter.addDocument(doc)
Example No. 14
def write_metadata(searcher, reader, document_ids, fname):
    allFields = set([])
    docFields = []

    for txtorg_id in document_ids:
        query = TermQuery(Term('txtorg_id',txtorg_id))
        scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs
        assert len(scoreDocs) == 1
        scoreDoc = scoreDocs[0]
        doc = searcher.doc(scoreDoc.doc)
        df = {}
        for f in doc.getFields():
            field = Field.cast_(f)
            df[field.name()] = field.stringValue()
        docFields.append(df)
        allFields = allFields.union(set(df.keys()))

    
    fields = ['name','path'] + sorted([x for x in allFields if x not in ['name','path']])
    with codecs.open(fname, 'w', encoding='UTF-8') as outf:
        dw = DictUnicodeWriter(outf, fields)
        
        # writing header
        dhead = dict()
        for k in fields:
            dhead[k] = k
        dw.writerow(dhead)
        
        # writing data
        for d in docFields:
            dw.writerow(d)
Example No. 15
    def findWildcard(self, word, field='key', max=10):
        query = WildcardQuery(Term(field, word))
        searcher = self.searcher
        hits = searcher.search(query, None, max)
        recs = []
        fields = self.fields

        for hit in hits.scoreDocs:
            # A ScoreDoc only carries the doc number and score; the stored
            # Document has to be fetched from the searcher.
            doc = searcher.doc(hit.doc)
            recs.append(doc)

        out = []
        if fields:
            for doc in recs:
                r = {}
                for f in fields:
                    r[f] = doc.get(f)
                out.append(r)
        else:
            for doc in recs:
                r = {}
                for f in doc.fields():
                    f = Field.cast_(f)
                    r[f.name()] = f.stringValue()
                out.append(r)
        return out
Example No. 16
def createIndex():
    # initialize lucene and the JVM
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    # get the index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))

    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data=myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example No. 17
    def main(cls, argv):

        if len(argv) < 5:
            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
            return

        docsInIndex = int(argv[1])

        # create an index called 'index-dir' in a temp directory
        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                                'index-dir')
        dir = FSDirectory.getDirectory(indexDir, True)
        analyzer = SimpleAnalyzer()
        writer = IndexWriter(dir, analyzer, True)

        # set variables that affect speed of indexing
        writer.setMergeFactor(int(argv[2]))
        writer.setMaxMergeDocs(int(argv[3]))
        writer.setMaxBufferedDocs(int(argv[4]))
        # writer.infoStream = System.out

        print "Merge factor:  ", writer.getMergeFactor()
        print "Max merge docs:", writer.getMaxMergeDocs()
        print "Max buffered docs:", writer.getMaxBufferedDocs()

        start = time()
        for i in xrange(docsInIndex):
            doc = Document()
            doc.add(
                Field("fieldname", "Bibamus", Field.Store.YES,
                      Field.Index.TOKENIZED))
            writer.addDocument(doc)

        writer.close()
        print "Time: ", timedelta(seconds=time() - start)
Example No. 18
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(
            Field("partnum", "Q36", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(
            Field("description", "Illidium Space Modulator", Field.Store.YES,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
Example No. 19
def luceneIndexer(contents):
    lucene.initVM()

    INDEXIDR = settings.INDEX_DIR

    indexdir = SimpleFSDirectory(File(INDEXIDR))

    analyzer = StandardAnalyzer(Version.LUCENE_30)

    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile

        document = Document()

        content = tfile.getvalue()

        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
        index_writer.optimize()
        print index_writer.numDocs()
    index_writer.close()
Example No. 20
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example No. 21
def reindex_all(reader, writer, analyzer):
    for i in xrange(reader.maxDoc()):
        if reader.isDeleted(i): continue
        doc = reader.document(i)
        p = doc.get("path")
        pkid = doc.get('txtorg_id')
        if p is None:
            # No filepath specified, just use original document
            writer.updateDocument(Term("txtorg_id",pkid),doc,analyzer)
        else:
            # if a path field is found, try to read the file it points to and add a contents field
            edited_doc = Document()
            for f in doc.getFields():
                edited_doc.add(Field.cast_(f))

            try:
                inf = open(p)
                contents = unicode(inf.read(), 'UTF-8')
                inf.close()

                if len(contents) > 0:
                    edited_doc.add(Field("contents", contents,
                                         Field.Store.NO,
                                         Field.Index.ANALYZED,
                                         Field.TermVector.YES))
                else:
                    print "warning: no content in %s" % p
            except:
                print "Could not read file %s; skipping" % p
            writer.updateDocument(Term("txtorg_id",pkid),edited_doc,analyzer)
Example No. 22
    def indexValue(self, indexWriter, value, uItem, uAttr, uValue, version):

        STORED = Field.Store.YES
        UN_STORED = Field.Store.NO
        TOKENIZED = Field.Index.TOKENIZED
        UN_INDEXED = Field.Index.NO
        UN_TOKENIZED = Field.Index.UN_TOKENIZED

        doc = Document()
        doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
        doc.add(Field("version", str(version), STORED, UN_INDEXED))
        doc.add(
            Field("contents", value, UN_STORED, TOKENIZED,
                  Field.TermVector.YES))
        indexWriter.addDocument(doc)
Example No. 23
    def testFuzzy(self):

        self.indexSingleFieldDocs([Field("contents", "fuzzy", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "wuzzy", Field.Store.YES,
                                         Field.Index.ANALYZED)])

        searcher = IndexSearcher(self.directory)
        query = FuzzyQuery(Term("contents", "wuzza"))
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(2, len(scoreDocs), "both close enough")

        self.assert_(scoreDocs[0].score != scoreDocs[1].score,
                     "wuzzy closer than fuzzy")
        self.assertEqual("wuzzy",
                         searcher.doc(scoreDocs[0].doc).get("contents"),
                         "wuzza bear")
Example No. 24
    def main(cls, argv):

        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return

        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(
                Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("t9", cls.t9(word), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("length", str(len(word)), Field.Store.NO,
                      Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.optimize()
        writer.close()

        reader.close()
Example No. 25
    def indexFile(self, writer, path):

        try:
            file = open(path)
            string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
            file.close()
        except:
            raise
        else:
            doc = Document()
            doc.add(Field("contents", StringReader(string)))
            doc.add(
                Field("filename", os.path.abspath(path), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

            return doc
Example No. 26
 def indexDocs(self, root, writer):
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             if not filename.endswith('.txt'):
                 continue
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 for line in file:
                     doc = Document()
                     arr = line.split('\t')
                     field = Field("name", arr[2].lower(), Field.Store.YES,
                                   Field.Index.TOKENIZED)
                     field.setBoost(1.5)
                     doc.add(field)
                     doc.add(
                         Field("alternate_names", arr[3].lower(),
                               Field.Store.YES, Field.Index.TOKENIZED))
                     doc.add(
                         Field("state", arr[10].lower(), Field.Store.YES,
                               Field.Index.TOKENIZED))
                     doc.add(
                         Field("population", arr[14], Field.Store.YES,
                               Field.Index.UN_TOKENIZED))
                     if int(arr[14]) > 1000000:
                         doc.setBoost(1.2)
                     writer.addDocument(doc)
                 file.close()
             except Exception, e:
                 print "Failed in indexDocs:", e
Example No. 27
    def _addDoc(self, text, writer):
        """
        Add a single document to the Lucene index.
        The text is indexed and stored under the field name "field".
        """

        doc = Document()
        doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
Example No. 28
def get_fields_and_values(reader, max_vals = 30):
    all_fields = defaultdict(set)

    for i in xrange(reader.maxDoc()):
        if reader.isDeleted(i): continue
        doc = reader.document(i)
        for f in doc.getFields():
            field = Field.cast_(f)
            if len(all_fields[field.name()]) < max_vals:
                all_fields[field.name()].add(field.stringValue())

    return dict(all_fields)
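A rough usage sketch, assuming the same PyLucene 3.x reader API seen in the other examples; the index path is a placeholder:

reader = IndexReader.open(SimpleFSDirectory(File("/path/to/index")), True)
for name, values in sorted(get_fields_and_values(reader).items()):
    # Each stored field name with up to max_vals sample values.
    print name, sorted(values)
reader.close()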
Example No. 29
    def addDocuments(self, dir, maxFieldLength):

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength(maxFieldLength))
        
        for keyword, unindexed, unstored, text in \
                izip(self.keywords, self.unindexed, self.unstored, self.text):
            doc = Document()
            doc.add(Field("id", keyword,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("country", unindexed,
                          Field.Store.YES, Field.Index.NO))
            doc.add(Field("contents", unstored,
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("city", text,
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
Example No. 30
    def index(self, doc, title, department, url):
        indexdir = SimpleFSDirectory(File(self.indir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        index_writer = IndexWriter(indexdir, analyzer, self.init,
                                   IndexWriter.MaxFieldLength(512))
        self.init = False

        # Initialize document and index it
        document = Document()
        document.add(
            Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
        document.add(
            Field("department", department, Field.Store.YES,
                  Field.Index.ANALYZED))
        document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)

        index_writer.optimize()
        index_writer.close()
Example No. 31
def parse_file(file_path, writer):
    f = open(file_path, 'r')
    soup = BeautifulSoup(f.read())
    f.close()
    doc = Document()
    content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    content = ""
    for tag in content_tags:
        matches = soup.find_all(tag)
        for match in matches:
            if match.string:
                content += match.string + " "
    afinn_score = afinn.sentiment(content)
    doc.add(Field("filepath", file_path, Field.Store.YES,
                  Field.Index.ANALYZED))
    doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(
        Field("sentiment", str(afinn_score), Field.Store.YES,
              Field.Index.ANALYZED))
    writer.addDocument(doc)
Example No. 32
    def indexFile(self, writer, path):

        doc = Document()

        try:
            process = popen2.Popen4(["antiword", "-m", "UTF-8", path])
            string = InputStreamReader(process.fromchild, 'utf-8').read()
        except:
            raise
        else:
            doc.add(Field("contents", StringReader(string)))
            doc.add(Field("filename", os.path.abspath(path),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

            exitCode = process.wait()
            if exitCode != 0:
                raise RuntimeError, "antiword exit code %d" % exitCode

            return doc
Example No. 33
    def testWildcard(self):

        self.indexSingleFieldDocs([Field("contents", "wild", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "child", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "mild", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "mildew", Field.Store.YES,
                                         Field.Index.ANALYZED)])

        searcher = IndexSearcher(self.directory)
        query = WildcardQuery(Term("contents", "?ild*"))
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(3, len(scoreDocs), "child no match")

        self.assertEqual(scoreDocs[0].score, scoreDocs[1].score,
                         "score the same")
        self.assertEqual(scoreDocs[1].score, scoreDocs[2].score,
                         "score the same")
Example No. 34
    def setUp(self):

        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(
            Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(
            Field("field", "the fast fox hopped over the hound",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)
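A hypothetical test method that could accompany this setUp; it relies only on the two documents indexed above:

    def testFoxInBothDocuments(self):
        scoreDocs = self.searcher.search(
            TermQuery(Term("field", "fox")), 10).scoreDocs
        # "fox" occurs in both documents, "hound" only in the second.
        self.assertEqual(2, len(scoreDocs))
        scoreDocs = self.searcher.search(
            TermQuery(Term("field", "hound")), 10).scoreDocs
        self.assertEqual(1, len(scoreDocs))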
Example No. 35
    def someMethod(self):

        directory = RAMDirectory()

        analyzer = StandardAnalyzer()
        writer = IndexWriter(directory, analyzer, True)

        doc = Document()
        doc.add(Field.Text("title", "This is the title"))
        doc.add(Field.UnStored("contents", "...document contents..."))
        writer.addDocument(doc)

        writer.addDocument(doc, analyzer)

        expression = "some query"

        query = QueryParser.parse(expression, "contents", analyzer)

        parser = QueryParser("contents", analyzer)
        query = parser.parse(expression)
Example No. 36
    def setUp(self):

        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc = Document()
        doc.add(Field("contents",
                      "The quick brown fox jumps over the lazy dogs",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()
Example No. 37
def write_contents(allDicts, searcher, reader, fname, content_field = "contents"):
    all_ids = [d['txtorg_id'] for d in allDicts]

    all_fields = set()
    doc_fields = []
    for txtorg_id in all_ids:
        query = TermQuery(Term('txtorg_id',txtorg_id))
        scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs
        assert len(scoreDocs) == 1
        scoreDoc = scoreDocs[0]
        doc = searcher.doc(scoreDoc.doc)
        df = {}
        name_path_present = False
        failFlag = False
        for f in doc.getFields():
            field = Field.cast_(f)
            if content_field == "contents" and field.name() == 'path':
                name_path_present = True
                path = doc.get("path").encode('utf-8')
                try:
                    i = codecs.open(path, 'r', encoding='UTF-8')
                    c = i.read()
                    df[content_field] = c
                    i.close()
                except Exception as e:
                    failFlag = True
                    print "Failed for path %s with exception %s" % (path, e)
            elif field.name() in ['txtorg_id', 'name', 'path', content_field]:
                df[field.name()] = field.stringValue()
        
        all_fields = all_fields.union(set(df.keys()))
        doc_fields.append(df)

    fields = ['txtorg_id'] + sorted([x for x in all_fields if x != 'txtorg_id'])
    with codecs.open(fname, 'w', encoding='UTF-8') as outf:
        dw = csv.DictWriter(outf, fields)
        dw.writeheader()
        
        # writing data
        for d in doc_fields:
            dw.writerow(d)

    return failFlag
Example No. 38
def add_metadata_to_doc(lucenedoc,fieldnames,values):
    edited_doc = Document()
    filepath = lucenedoc.get("path")
    assert filepath is not None

    # Include all original fields that are not in the list of updates
    original_fields = []
    for f in lucenedoc.getFields():
        field = Field.cast_(f)
        if field.name() not in fieldnames:
            original_fields.append(field)

    for field in original_fields:
        edited_doc.add(field)
                
    # Now, add back the unstored "contents" field
    try:
        file = open(filepath)
        contents = unicode(file.read(), 'UTF-8')
        file.close()

        if len(contents) > 0:
            edited_doc.add(Field("contents", contents,
                                 Field.Store.NO,
                                 Field.Index.ANALYZED,
                                 Field.TermVector.YES))
        else:
            print "warning: no content in %s" % filepath
    except:
        print "Could not read file %s; skipping" % filepath
        return None

    # Now include new fields
    for idx in range(len(fieldnames)):
        edited_doc.add(Field(fieldnames[idx].lower(),values[idx].lower(),Field.Store.YES,Field.Index.NOT_ANALYZED))

    return edited_doc
Example No. 39
    def index_file(self, path):
        print path
        YES = Field.Store.YES
        NO = Field.Store.NO
        NOT_ANALYZED = Field.Index.NOT_ANALYZED
        ANALYZED = Field.Index.ANALYZED
        metadata = self.metadata

        handle = codecs.open(path, encoding='utf8')
        filename = os.path.basename(path)
        if metadata:
            user_number = WebPage.get(filename).user
            user_record = list(User.select(User.q.number == user_number))[0]
            gender = (user_record.sex if user_record.sex is not None 
                    else "Unknown")
            birthyear = (str(user_record.birthyear) 
                    if user_record.birthyear is not None else '0')

        s = handle.read()

        pos = s.find(u'\n\n') + 2    # Skip first block: declaration
        nextpos = 0
        sentence_index = 0

        doc = Document()
        sentence_index_field = Field("sentence_index", str(sentence_index), YES,
            NOT_ANALYZED)
        user_field = Field("user", '', YES, NOT_ANALYZED)
        gender_field = Field("gender", '', YES, NOT_ANALYZED)
        birthyear_field = Field("birthyear", '', YES, NOT_ANALYZED)
        filename_field = Field("filename", '', YES, NOT_ANALYZED)
        store_contents = NO if self.compress else YES
        contents_field = Field("contents", '', store_contents, ANALYZED)
        # change this to Field('compressed', '', NO)? Second argument needs to
        # be bytes, so maybe '\xfe'
        if self.compress:
            compressed_field = Field("compressed", '\xfe', YES, NOT_ANALYZED)
        doc.add(sentence_index_field)
        if metadata:
            doc.add(user_field)
            doc.add(gender_field)
            doc.add(birthyear_field)
            doc.add(filename_field)
        doc.add(contents_field)

        if self.compress:
            doc.add(compressed_field)

        while nextpos != len(s):
            nextpos = s.find(u'\n \n', pos)
            if nextpos == -1:
                nextpos = len(s)
            text = s[pos:nextpos]
            text = text.replace(u' ', u'@')
            pos = nextpos + 2

            sentence_index_field.setValue(str(sentence_index))
            if metadata:
                user_field.setValue(user_number)
                gender_field.setValue(gender)
                birthyear_field.setValue(birthyear)
                filename_field.setValue(filename)

            contents_field.setValue(text)
            # todo:
            # remove header and checksum added by zlib
            # make sure what gets stored is binary rather than unicode
            if self.compress:
                compressed = self.compressor.compress(text.encode('utf8'))
                compressed_field.setValue(compressed)
            self.writer.addDocument(doc)
            sentence_index = sentence_index + 1