コード例 #1
0
    def __init__(self, folder=None, fields=None, similarity="tfidf"):
        """Start the JVM and set up the directory, field types and analyzer.

        Args:
            folder: Path of an on-disk index directory. When falsy, an
                in-memory ``RAMDirectory`` is used instead.
            fields: Iterable of field descriptors. Each descriptor must
                expose a ``name`` attribute and a ``props`` mapping of
                ``FieldType`` property name -> value. Defaults to no fields.
            similarity: Name of the similarity model; stored lower-cased.

        Raises:
            AttributeError: If a property in ``props`` has no matching
                ``set<Property>`` method on ``FieldType``.
        """
        self.jcc = lucene.initVM()

        if folder:
            self.directory = SimpleFSDirectory(File(folder))
        else:
            self.directory = RAMDirectory()

        self.fields = {}

        # ``None`` replaces the former mutable ``[]`` default.
        for field in (fields if fields is not None else ()):
            ft = FieldType()
            for pname, pvalue in field.props.items():
                # str.capitalize() lower-cases everything after the first
                # character, which breaks camelCase properties such as
                # "storeTermVectors" (-> "setStoretermvectors"). Upper-case
                # only the first letter, and fall back to the old spelling
                # so every name that used to resolve still resolves.
                setter = getattr(
                    ft, "set" + pname[:1].upper() + pname[1:], None)
                if setter is None:
                    setter = getattr(ft, "set" + pname.capitalize())
                setter(pvalue)

            # Applied after the props, so it overrides any index options
            # supplied through ``props``.
            ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

            self.fields[field.name] = ft

        self.similarity = similarity.lower()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # Writer and searcher start unset.
        self.writer = None
        self.searcher = None
コード例 #2
0
    def indexsents(self, sentences, writer):
        """Index each sentence as one document, then commit and close.

        Every document gets a stored, untokenized ``name`` (the sentence's
        position in ``sentences``) and ``path`` (the current working
        directory). Non-empty sentences also get a tokenized, unstored,
        lower-cased ``contents`` field. A failure on one sentence is printed
        and does not stop the run.
        """
        meta_type = FieldType()
        meta_type.setStored(True)
        meta_type.setTokenized(False)
        meta_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        text_type = FieldType()
        text_type.setStored(False)
        text_type.setTokenized(True)
        text_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for position, sentence in enumerate(sentences):
            try:
                cwd = os.getcwd()
                document = Document()
                document.add(Field('name', str(position), meta_type))
                document.add(Field('path', cwd, meta_type))

                if len(sentence) == 0:
                    print('warning: no content in %s' % str(position))
                else:
                    document.add(
                        Field('contents', sentence.lower(), text_type))

                writer.addDocument(document)
            except Exception as e:
                print('Failed in indexsents:', e)

        # Flush everything to the index and release the writer.
        writer.commit()
        writer.close()
コード例 #3
0
ファイル: IndexFiles.py プロジェクト: w2wei/XPRC
 def indexDocs(self, root, writer):
     t1 = FieldType() # for short items, e.g. file name.
     t1.setIndexed(True)
     t1.setStored(True)
     t1.setTokenized(False)
     t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # DOCS_AND_FREQS_AND_POSITIONS_OFFSETS
     
     t2 = FieldType() # for content
     t2.setIndexed(True)
     t2.setStored(False) # don't store the original text
     t2.setTokenized(True)
     t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
     
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 contents = unicode(file.read(), 'iso-8859-1')
                 file.close()
                 doc = Document()
                 doc.add(Field("name", filename, t1))
                 doc.add(Field("path", root, t1))
                 if len(contents) > 0:
                     doc.add(Field("contents", contents, t2))
                 else:
                     print "warning: no content in %s" % filename
                 writer.addDocument(doc)
             except Exception, e:
                 print "Failed in indexDocs:", e
コード例 #4
0
ファイル: indexer.py プロジェクト: zz-mars/rzync
    def indexDocs(self, root, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            # traverse through the doc directory
            for filename in filenames:
                #	if not filename.endswith('.cdc'):
                #		continue
                try:
                    # only add the filename and path for indexing
                    path = os.path.join(root, filename)
                    print "adding file : ", path
                    file = open(path)
                    contents = unicode(file.read(), 'utf-8')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in ", filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "failed in indexDocs:", e
コード例 #5
0
    def index_image(self, root, writer):
        """Index the image records listed in ``<root>/index.txt``.

        Each line is split on whitespace; the first two tokens are taken as
        the image URL and its text. Lines with fewer than two tokens are
        reported and skipped. Per document:

        * ``raw_content``: the original text, stored and untokenized.
        * ``url``: the image URL, stored and untokenized.
        * ``content``: the text segmented with ``jieba.cut_for_search``,
          with blank tokens and ``self.stop_words`` removed, joined by
          spaces; tokenized and not stored.
        """
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        with open(os.path.join(root, "index.txt"), mode="r",
                  encoding="utf8") as index:
            # Number of documents actually written. The counter used to
            # start at 1, so the final summary over-reported by one.
            added = 0
            for line in index:
                # Progress shows the 1-based number of the next document.
                print("\r", added + 1, end="", sep="")
                try:
                    image_url, content = line.strip().split()[:2]
                except ValueError as e:
                    print(e)
                    continue
                doc = Document()
                doc.add(Field("raw_content", content, t1))
                content = " ".join(
                    word for word in jieba.cut_for_search(content)
                    if word.strip() and word not in self.stop_words)

                doc.add(Field("url", image_url, t1))
                doc.add(Field("content", content, t2))
                writer.addDocument(doc)
                added += 1
            print("\n{count} image(s) added.".format(count=added))
コード例 #6
0
ファイル: indexer.py プロジェクト: JinwooSeong/thu-IR-hw4
    def Indexing(self, writer):
        """Index the segmented corpus file line by line, then close ``writer``.

        Each line of ``SEGMENTATION_FILE`` is passed to
        ``self.process_line``, which returns the text for the ``context``
        field and the text for the ``phrase`` field. Unless
        ``self.training`` is truthy, indexing stops once the number of
        processed lines exceeds ``self.index_limit``.
        """
        print("Indexing Segmented File [", SEGMENTATION_FILE, "]")

        # The field types are the same for every line, so build them once
        # instead of once per document.
        # ``context``: indexed (docs + freqs), stored and tokenized.
        fieldtype_context = FieldType()
        fieldtype_context.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        fieldtype_context.setStored(True)
        fieldtype_context.setTokenized(True)

        # ``phrase``: only needs to be stored.
        fieldtype_phrase = FieldType()
        fieldtype_phrase.setStored(True)

        with open(SEGMENTATION_FILE, 'r') as f:
            line_count = 0
            for line in f:
                # Split the segmented line into the words and their
                # part-of-speech tags so they can be stored separately.
                processed_context, processed_phrase = self.process_line(line)

                doc = Document()
                # ``context`` records the content of the article.
                doc.add(Field('context', processed_context, fieldtype_context))
                # ``phrase`` records the part of speech of each word.
                doc.add(Field('phrase', processed_phrase, fieldtype_phrase))

                # Write the document into the index.
                writer.addDocument(doc)

                # Progress output.
                print("\r", str(line_count), " lines", end="", flush=True)
                line_count = line_count + 1
                # The check runs after the increment with a strict ``>``,
                # so up to ``index_limit + 1`` lines are indexed.
                if line_count > self.index_limit and not self.training:
                    break

        writer.close()
        print()
コード例 #7
0
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'gbk')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
コード例 #8
0
ファイル: IndexFiles.py プロジェクト: rookie5372/FaceAlbum
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        index_file = open("index.txt", 'r')
        for line in index_file.readlines():

            try:
                src = line.strip().split('\t')[0]
                filename = line.strip().split('\t')[1]
                tag = line.strip().split('\t')[2]
                path = os.path.join(root, filename)

                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                doc.add(Field("src", src, t1))

                if len(tag) > 0:
                    doc.add(Field("tag", tag, t2))
                else:
                    print "warning: no tag in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
コード例 #9
0
ファイル: q1_index.py プロジェクト: ashayaan/text-processing
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setStoreTermVectors(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPositions(True)
        t2.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        file_path = root + 'r52-train-all-terms.txt'
        fd = open(file_path)
        contents = fd.readlines()
        fd.close()
        contents_list = [x.strip() for x in contents]
        for i in xrange(len(contents_list)):
            try:
                [topic, content] = contents_list[i].split('\t')
                doc = Document()
                doc.add(Field("id", str(i), t1))
                doc.add(Field("topic", topic, t1))
                doc.add(Field("contents", content, t2))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
コード例 #10
0
ファイル: index_sent.py プロジェクト: sougata09/CNN-QA
    def indexsents(self, sentences, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for i, sent in enumerate(sentences):
            #print "adding",i, sent
            try:
                root = os.getcwd()
                #contents = unicode(sent, 'iso-8859-1')
                doc = Document()
                doc.add(Field("name", str(i), t1))
                doc.add(Field("path", root, t1))
                if len(sent) > 0:
                    doc.add(Field("contents", sent.lower(), t2))
                else:
                    print "warning: no content in %s" % str(i)
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexsents:", e
コード例 #11
0
    def indexDocs(self, root, iw):
        """Index every ``.txt`` file directly inside ``root``, then close ``iw``.

        Two field types are built and handed to ``self.parseBook`` together
        with each file path and the writer: one stored and tokenized with
        positions, and one that additionally stores term vectors. Entries
        that do not end in ``.txt`` are reported and skipped.
        """
        positional_type = FieldType()
        positional_type.setStored(True)
        positional_type.setTokenized(True)
        positional_type.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        vector_type = FieldType()
        vector_type.setStored(True)
        vector_type.setTokenized(True)
        vector_type.setStoreTermVectors(True)
        vector_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for entry in os.listdir(root):
            if entry.endswith(".txt"):
                print("adding", entry)
                book_path = os.path.join(root, entry)
                self.parseBook(book_path, positional_type, vector_type, iw)
            else:
                print("file is not a txt file. we skip it.")

        # Summary of missing data:
        #   AuthorError   - number of authors not found
        #   TitleError    - number of titles not found
        #   DocumentError - number of documents whose text could not be
        #                   extracted, so the entire document was indexed
        summary = (
            ("AuthorError: {}", self.authorcount),
            ("TitleError: {}", self.titlecount),
            ("DocumentError: {}", self.errorcount),
        )
        for template, value in summary:
            print(template.format(value))
        iw.close()
コード例 #12
0
    def indexer(self, root, writer):
        """Index every line of every file under ``root`` as one document.

        Each line is split on its first two spaces into three parts. Per
        document, all stored and tokenized:

        * ``docname``: the first two parts joined by a space.
        * ``name``: the first part with markup punctuation replaced by
          spaces.
        * ``contents``: the remainder of the line.

        Lines with fewer than three parts (for example blank lines) are
        reported and skipped instead of aborting the run with an
        ``IndexError``.
        """
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        def replacer(text):
            """Return ``text`` with each markup character turned into a space."""
            chars = '\\`*_{}[]()>#+-.!$‘'
            for c in chars:
                text = text.replace(c, ' ')
            return text

        for dirpath, dirnames, filenames in os.walk(root):
            # The counter restarts at 1 in every directory.
            for i, filename in enumerate(filenames, 1):
                with open(os.path.join(dirpath, filename)) as f:
                    for line in f:
                        parts = line.split(' ', 2)
                        if len(parts) < 3:
                            print('warning: skipping malformed line in %s'
                                  % filename)
                            continue
                        docname = parts[0] + ' ' + parts[1]
                        name = replacer(parts[0])
                        contents = parts[2]
                        doc = Document()
                        doc.add(Field('docname', docname, t1))
                        doc.add(Field('name', name, t1))
                        doc.add(Field('contents', contents, t1))
                        writer.addDocument(doc)
                print('File %d done indexing' % i)
コード例 #13
0
def build_index(document_path, dir_path):
    """Build a fresh index in ``dir_path`` from the records in ``document_path``.

    Each non-blank line of ``document_path`` is ``<path> <tag1,tag2,...>``
    (split on single spaces). The tags are joined with spaces into a stored,
    tokenized ``content`` field; the path goes into a stored, untokenized
    ``url`` field. The index is opened in ``CREATE`` mode.

    Blank lines are skipped. If anything fails while indexing, the writer is
    rolled back (which closes it without committing) and the error is
    re-raised, so the writer is not left open.

    Raises:
        IndexError: If a non-blank line has no second space-separated field.
    """
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    try:
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(False)

        with open(document_path) as input_file:
            for line in input_file:
                record = line.strip()
                if not record:
                    # A blank line has no fields; indexing segs[1] on it
                    # used to raise IndexError.
                    continue
                segs = record.split(" ")
                music_path, music_tags = segs[0], segs[1].split(",")

                document = Document()
                document.add(Field("content", " ".join(music_tags), t1))
                document.add(Field("url", music_path, t2))
                index_writer.addDocument(document)
    except Exception:
        # Discard the partial work and release the writer before
        # propagating the failure.
        index_writer.rollback()
        raise
    else:
        index_writer.close()
コード例 #14
0
def index_docs(root, writer):
    # metadata: name and path
    metadata = FieldType()
    metadata.setStored(True)  # as is value
    metadata.setTokenized(False)
    metadata.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # content: abstract and body
    content_type = FieldType()
    content_type.setStored(True)  # to highlight on search results
    content_type.setTokenized(True)  # tokenize words
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for directory, _, file_names in walk(root):
        for file_name in file_names:
            name, extension = splitext(file_name)
            if extension not in DOC_FORMATS:
                continue  # skip unsupported formats

            file_path = join(directory, file_name)
            print ' ', file_path

            # Build indexed document
            doc = Document()
            doc.add(Field('name', file_name, metadata))
            doc.add(Field('path', directory, metadata))

            # Read file contents
            content = process(file_path, 'utf-8', method='pdfminer')
            abstract = extract_abstract(content)
            doc.add(Field('content', content, content_type))
            doc.add(Field('abstract', abstract, content_type))

            writer.addDocument(doc)
コード例 #15
0
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        inFile = open(str(args["inputFile"]))
        indexName = inFile.readline()
        while (indexName != ''):
            print "adding", indexName
            doc = Document()
            doc.add(Field("name", indexName, t1))
            #doc.add(Field("path", root, t1))
            text = inFile.readline()
            if (len(text) > 0):
                print("contents: %s\n" % text)
                doc.add(Field("contents", text, t2))
            else:
                print "warning: no content in %s" % indexName
            indexName = inFile.readline()
            writer.addDocument(doc)
        inFile.close()
コード例 #16
0
    def build_index(self, dict_data):
        """Index ``dict_data`` with ``self.writer``, then commit and close it.

        Args:
            dict_data: Mapping of document id -> document text. The id is
                stored untokenized in ``id``; the text is stored and
                tokenized in ``content``.

        A ``Ticker`` runs on a background thread while the index is
        committed and closed.
        """
        print("loading data...")
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for k, v in dict_data.items():
            doc = Document()
            doc.add(Field("id", k, t1))
            doc.add(Field("content", v, t2))
            self.writer.addDocument(doc)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        try:
            self.writer.commit()
            self.writer.close()
        finally:
            # Clear the flag even when commit/close raises.
            # NOTE(review): presumably Ticker.run loops while ``tick`` is
            # truthy, so leaving it set would keep the thread running;
            # confirm against Ticker.
            ticker.tick = False
        print("done")
コード例 #17
0
    def indexDocs(self, img_url, toi, tid):
        """Download ``img_url`` and save a copy under ``Picture_new/``.

        The image is first written to the scratch file ``Pictures/1.jpg``
        and loaded with OpenCV. It is then stored as
        ``Picture_new/<strs>/<toi>___<tid>.jpg``, where ``<strs>`` is the
        ``strs`` attribute of ``img_search_color(img)``; that folder is
        created when missing. No Lucene document is written here. Any
        failure is printed and swallowed.
        """
        try:
            print("Adding", img_url)
            name = "Pictures/1.jpg"
            conn = urllib.urlopen(img_url)
            try:
                # Both the scratch file and the connection are released
                # even if the download fails half-way.
                with open(name, 'wb') as f:
                    f.write(conn.read())
            finally:
                conn.close()
            img = cv2.imread(name)
            sdf = img_search_color(img)
            storeDir = 'Picture_new/' + sdf.strs
            if not os.path.exists(storeDir):
                os.mkdir(storeDir)
            cv2.imwrite(storeDir + '/' + str(toi) + '___' + str(tid) + '.jpg',
                        img)
        except Exception as e:
            print("Failed in indexDocs:", e)
コード例 #18
0
ファイル: index.py プロジェクト: sangheestyle/nlp2014
    def index_docs(self, train_set, writer):
        """Add one document per training record to ``writer``.

        The record's ``Answer``, ``Question ID``, ``category`` and
        ``Sentence Position`` values are stored as untokenized fields. The
        question text and the text returned by
        ``self.wiki_reader.get_text`` for the answer are stored, tokenized
        and indexed with positions and full term vectors.
        """
        exact_type = FieldType()
        exact_type.setIndexed(True)
        exact_type.setStored(True)
        exact_type.setTokenized(False)
        exact_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        text_type = FieldType()
        text_type.setIndexed(True)
        text_type.setStored(True)
        text_type.setTokenized(True)
        text_type.setStoreTermVectorOffsets(True)
        text_type.setStoreTermVectorPayloads(True)
        text_type.setStoreTermVectorPositions(True)
        text_type.setStoreTermVectors(True)
        text_type.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # (index field name, record key) pairs stored verbatim, in the
        # order they are added to each document.
        exact_fields = (
            ("answer", 'Answer'),
            ("qid", 'Question ID'),
            ("category", 'category'),
            ("position", 'Sentence Position'),
        )

        for record in train_set:
            doc = Document()
            for field_name, key in exact_fields:
                doc.add(Field(field_name, record[key], exact_type))
            doc.add(Field("question", record['Question Text'], text_type))
            wiki_text = self.wiki_reader.get_text(record['Answer'])
            doc.add(Field("wiki_plain", wiki_text, text_type))
            writer.addDocument(doc)
コード例 #19
0
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.html'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'iso-8859-1')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
コード例 #20
0
    def indexDocs(self, root, writer):
        """Index the members of the zip archive at ``root`` line by line.

        The first archive entry is skipped. In every other member, each
        UTF-8 line is NFD-normalized and split on its first two spaces into
        three parts. Lines whose second part is not all digits, or that have
        fewer than three parts, are skipped. Per document, all stored and
        tokenized:

        * ``docname``: the first two parts joined by a space.
        * ``name``: the first part with every non-alphanumeric character
          replaced by a space.
        * ``contents``: the remainder of the line.

        The archive and each member handle are closed when done.
        """
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        with ZipFile(root, 'r') as wikiFile:
            files = wikiFile.namelist()

            for i, member in enumerate(files[1:], 1):
                with wikiFile.open(member, 'r') as wiki:
                    # NOTE(review): the original nested two loops over the
                    # same handle, which discarded the first raw line of
                    # every member before decoding the rest. That behaviour
                    # is kept here explicitly; confirm the first line really
                    # is a header that should be dropped.
                    next(wiki, None)
                    for line in codecs.iterdecode(wiki, 'utf8'):
                        parts = unicodedata.normalize('NFD',
                                                      line).split(' ', 2)
                        # Short lines used to raise IndexError here.
                        if len(parts) < 3 or not parts[1].isdigit():
                            continue
                        docname = parts[0] + ' ' + parts[1]
                        name = re.sub(r'[^a-zA-Z0-9]', ' ', parts[0])
                        contents = parts[2]
                        doc = Document()
                        doc.add(Field('docname', docname, t1))
                        doc.add(Field('name', name, t1))
                        doc.add(Field('contents', contents, t1))
                        writer.addDocument(doc)
                print('File %d done indexing' % i, member)
コード例 #21
0
    def index_docs(self, tweets, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        t1.setStoreTermVectors(True)
        t1.setStoreTermVectorOffsets(True)

        # add each tweet to the index
        for tweet in tweets:
            try:
                # strip out URLs because they provide false index matches
                contents = []
                for word in tweet[1].text.split():
                    if word.startswith("http://") or word.startswith("https://"):
                        continue
                    contents.append(word)
                contents = " ".join(contents)

                if len(contents) == 0: continue

                doc = Document()
                doc.add(Field("contents", contents, t1))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in index_docs:", e
コード例 #22
0
ファイル: IndexFeeds.py プロジェクト: yelinkyaw/FeedsIndexer
    def indexDocs(self, url, writer):
        type1 = FieldType()
        type1.setIndexed(True)
        type1.setStored(True)
        type1.setTokenized(False)
        type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        type2 = FieldType()
        type2.setIndexed(True)
        type2.setStored(True)
        type2.setTokenized(True)
        type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
        
        # Read Feeds
        feeds = feedparser.parse(url)

        for item in feeds["entries"]:
            print "adding", item["title"] 
            try:
                link = item["link"] 
                contents = item["description"].encode("utf-8")
                contents = re.sub('<[^<]+?>', '', ''.join(contents))
                title = item["title"]
                doc = Document()
                doc.add(Field("url", link, type1))
                doc.add(Field("title", title, type1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, type2))
                else:
                    print "warning: no content in %s" % item["title"] 
                writer.addDocument(doc)
            except Exception, e:
                 print "Failed in indexDocs:", e
コード例 #23
0
    def indexDocs(self, sourceDir, writer):
        """Walk ``sourceDir`` and index every ``.txt`` file found.

        The file name is stored untokenized in ``name``. Non-empty files
        also get their UTF-8 text in a tokenized, unstored field named by
        ``queryField``.

        Only ``NameError`` is handled (a short message is printed); any
        other error, such as an I/O or decoding failure, propagates to the
        caller.
        """
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for dirpath, dirnames, filenames in os.walk(sourceDir):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print(filename)
                try:
                    path = os.path.join(dirpath, filename)
                    # ``with`` closes the file even when an exception that
                    # is not handled below propagates.
                    with open(path, 'r', encoding="utf-8") as handle:
                        contents = handle.read()
                    doc = Document()
                    doc.add(Field("name", filename, t1))  # filename (title)
                    if len(contents) > 0:
                        # NOTE(review): ``queryField`` is not defined in this
                        # method; it is expected to exist at module level.
                        doc.add(Field(queryField, contents, t2))  # content
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                except NameError:
                    print("Failed in indexDocs:")
コード例 #24
0
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(False)
        t3.setTokenized(True)#利用预先设置的analyzer进行分词,这里是根据空格
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        total=0
        file = open(root,"r")
        for line in file.readlines():
            try:
                imgurl, itemurl, content = line.split('\t')
                total+=1
                print total
                print "adding", content
                contents = ' '.join(jieba.cut(content))
                doc = Document()
                doc.add(Field("imgurl", imgurl, t1))
                doc.add(Field("itemurl", itemurl, t1))
                doc.add(Field("title", content, t1))
                doc.add(Field("contents",contents,t3))
                writer.addDocument(doc)
            except Exception, e:
                    print "Failed in indexDocs:", e
コード例 #25
0
def _createNoTermsFrequencyFieldType():
    """Return a frozen ``FieldType`` that indexes document ids only.

    The type is indexed and tokenized, omits norms, and uses the
    ``DOCS_ONLY`` index option. It is frozen before being returned.
    """
    field_type = FieldType()
    field_type.setIndexed(True)
    field_type.setTokenized(True)
    field_type.setOmitNorms(True)
    field_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    field_type.freeze()
    return field_type
コード例 #26
0
def _createNoTermsFrequencyFieldType():
    """Build the field type used for fields indexed without term frequencies.

    The type is indexed, tokenized, has norms omitted, uses the DOCS_ONLY
    index option and is frozen before being returned.
    """
    noFreqType = FieldType()
    for configure, value in (
            (noFreqType.setIndexed, True),
            (noFreqType.setTokenized, True),
            (noFreqType.setOmitNorms, True),
            (noFreqType.setIndexOptions, FieldInfo.IndexOptions.DOCS_ONLY)):
        configure(value)
    noFreqType.freeze()
    return noFreqType
コード例 #27
0
    def indexDocs(self, root, writer, urlDic):

        t1 = FieldType()
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(False)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.htm') and not filename.endswith(
                        '.html') and not filename.endswith(
                            '.com') and not filename.endswith('.cn'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)

                    url = urlDic[filename]
                    proto, rest = urllib.splittype(url)
                    site, rest = urllib.splithost(rest)

                    file = open(path)
                    contents = file.read()
                    file.close()

                    soup = BeautifulSoup(contents, features='html.parser')
                    title = soup.title.string
                    title = unicode(title).encode('utf-8')
                    title = title.replace("\n", '')

                    contents = soup.get_text().encode('utf-8')
                    seg_list = jieba.cut(contents)
                    contents = " ".join(seg_list)

                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", path, t1))
                    doc.add(Field("title", title, t1))
                    doc.add(Field("url", url, t1))
                    doc.add(Field("site", site, t3))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
コード例 #28
0
    def indexDocs(self, root, indextxt, writer):

        t1 = FieldType()
        t1.setIndexed(True)  #t1为需索引, 需保存, 需分词
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()  #t2为不需索引, 需保存, 需分词
        t2.setIndexed(False)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:

                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    content = unicode(file.read(), 'utf-8')
                    content = content.encode('utf-8')
                    list1 = content.split('\n')
                    file.close()

                    doc = Document()

                    # 读取存于文件的内容并保存于doc
                    url = list1[0]
                    print('url : ' + url)
                    doc.add(Field("url", url, t1))

                    name = list1[1]
                    print('name : ' + name)
                    doc.add(Field("name", name, t1))

                    collectnum = list1[2]
                    print('collect_num : ' + collectnum)
                    doc.add(Field("collect_num", collectnum, t2))

                    img_url = list1[3]
                    print('img_url : ' + img_url)
                    doc.add(Field("img_url", img_url, t2))

                    zhuliao = list1[4]
                    print(zhuliao)
                    doc.add(Field("zhuliao", zhuliao, t1))

                    zuofa = list1[5]
                    zuofa = '\n'.join(zuofa.split('\t'))
                    print('zuofa : ' + zuofa)
                    doc.add(Field("zuofa", zuofa, t2))

                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
コード例 #29
0
    def indexDocs(self, root, writer):
        """Index text, PDF, XML, DOC and ODT files found under ``root``.

        PDF, DOC and ODT files are first converted to a sibling ``.txt`` file
        by the external tools ``pdftotext``, ``antiword`` and ``odttotext``;
        the text is then read as ISO-8859-1 and indexed.  A file whose
        conversion or indexing fails is logged at debug level and skipped.

        The converters are started with argument lists instead of shell
        command strings, so file names containing spaces, quotes or shell
        metacharacters can neither break the command nor inject another one.

        :param root: directory to walk recursively
        :param writer: index writer; only its ``addDocument`` method is used
        """
        # t1 is used for filenames and t2 is used for contents
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        indexable_suffixes = ('.txt', '.pdf', '.xml', '.doc', '.odt')

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                # We can index only a certain types of files
                if not filename.endswith(indexable_suffixes):
                    continue
                try:
                    file_path = os.path.join(root, filename)
                    # Swap only the final extension; str.replace() would also
                    # rewrite the same text elsewhere in the name.
                    text_path = os.path.join(
                        root, os.path.splitext(filename)[0] + '.txt')

                    # First convert PDF, DOC and ODT files to text
                    if filename.endswith('.pdf'):
                        subprocess.check_output(
                            ['pdftotext', '-layout', file_path, text_path])
                        file_path = text_path
                    elif filename.endswith('.doc'):
                        # antiword writes to stdout; append it to the text
                        # file, as the former ">>" redirection did.
                        with open(text_path, 'ab') as converted:
                            subprocess.check_call(
                                ['antiword', file_path], stdout=converted)
                        file_path = text_path
                    elif filename.endswith('.odt'):
                        subprocess.check_output(
                            ['odttotext', '-layout', file_path, text_path])
                        file_path = text_path

                    with open(file_path) as source:
                        contents = unicode(source.read(), 'iso-8859-1')
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        logging.debug('warning: no content in %s', filename)
                    writer.addDocument(doc)
                except Exception as e:
                    logging.debug('Failed in indexDocs: %s', e)
コード例 #30
0
    def indexDocs_playlist(self, writer):

        t1 = FieldType()
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # playlist
        success = 0
        fail = 0
        noinfo = 0

        playlists = open('data/playlist_details2.db', "r")
        for line in playlists.readlines():
            if len(line) < 20:
                noinfo += 1
                continue

            information = line.split(',')
            try:
                playID = information[0]
                playname = ' '.join(information[1:-8])
                author, playImage, tags, songIDs, sharecount, playcount, subscribedcount, commentcount = information[
                    -8:]
            except Exception, e:
                fail += 1
                print "fail"
                continue

            playname = ' '.join(jieba.cut(playname))
            author = ' '.join(jieba.cut(author))
            tags = ' '.join(jieba.cut(tags))
            # playname = ' '.join(pynlpir.segment(playname, pos_tagging=False))
            # author = ' '.join(pynlpir.segment(author, pos_tagging=False))
            tags = tags.replace("|", " ")

            doc = Document()
            doc.add(Field("ID", playID, t2))
            doc.add(Field("name", playname, t2))
            doc.add(Field("author", author, t2))
            doc.add(Field("image", playImage, t1))
            doc.add(Field("tags", tags, t2))
            doc.add(Field("songIDs", songIDs, t1))
            doc.add(Field("sharecount", sharecount, t1))
            doc.add(Field("playcount", playcount, t1))
            doc.add(Field("subscribedcount", subscribedcount, t1))
            doc.add(Field("commentcount", commentcount, t1))
            writer.addDocument(doc)
            print "歌单", playname, "成功添加"
            success += 1
コード例 #31
0
ファイル: IndexFiles.py プロジェクト: elfdown/ee208
    def indexDocs(self, root, writer):
        """Index every ``.shtml`` page found under ``root``.

        Each page is stored with its file name, path and the URL looked up
        in ``self.relation``; its title and jieba-segmented text are added
        as ``title`` and ``contents``.  A page that fails (for example no
        ``self.relation`` entry) is reported and skipped.  The writer is
        committed after every 100 processed files.

        :param root: directory to walk recursively
        :param writer: index writer; ``addDocument`` and ``commit`` are used
        """
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.NONE)  # Not Indexed

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS
                           )  # Indexes documents, frequencies and positions.

        count = 0
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.shtml'):
                    continue

                try:
                    path = os.path.join(root, filename)
                    with open(path, 'r') as file:
                        contents = file.read()
                    soup = BeautifulSoup(contents, features="html.parser")
                    doc = Document()

                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", path, t1))

                    url = self.relation[filename]
                    doc.add(Field("url", url, t1))

                    if len(contents) > 0:
                        # A page without <title> used to raise
                        # AttributeError here and was dropped entirely;
                        # index it with an empty title instead.
                        title_tag = soup.find('title')
                        title = title_tag.text if title_tag is not None else ""

                        content = "".join(soup.findAll(text=True))
                        content = ' '.join(jieba.lcut(content))

                        doc.add(Field("title", title, t1))
                        doc.add(Field("contents", content, t2))

                    else:
                        doc.add(Field("title", "", t1))
                        doc.add(Field("contents", "", t2))

                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                except Exception as e:
                    print("Failed in indexDocs:", e)

                # Counts processed files, including ones that failed.
                count += 1
                if (count % 100 == 0):
                    writer.commit()
                    print(count)
コード例 #32
0
def create_minidoc(termstring, field='text'):
    """Wrap ``termstring`` in a single-field Lucene document.

    The field is stored, tokenized, indexed with documents and frequencies,
    and keeps term vectors (used for query expansion), which is why a
    custom field type is needed.

    :param termstring: text to place in the field
    :param field: name of the field (default ``'text'``)
    :return: the new ``Document``
    """
    vector_type = FieldType()
    vector_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    for enable in (vector_type.setStored,
                   vector_type.setTokenized,
                   vector_type.setStoreTermVectors):
        enable(True)

    minidoc = Document()
    minidoc.add(Field(field, termstring, vector_type))
    return minidoc
コード例 #33
0
ファイル: filesearch.py プロジェクト: bashwork/common
class Indexer(object):
    """Owns a Lucene ``IndexWriter`` and the field types used with it."""

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param root: The output directory of the underlying index
            (default ``"index"``); it is created when it does not exist
        :param analyzer: The analyzer to use instead of the default
            ``StandardAnalyzer``; it is wrapped so that at most 1048576
            tokens are taken per field
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        base_analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(base_analyzer, 1048576)

        # CREATE mode: a new index is written rather than appended to.
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Add a document to the index; failures are logged, not raised.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Close the underlying writer; failures are logged, not raised.
        """
        try:
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the two field types offered to callers.

        ``field_clean`` is indexed, stored and not tokenized;
        ``field_dirty`` is indexed, tokenized (with positions) and not
        stored.
        """
        clean = FieldType()
        clean.setIndexed(True)
        clean.setStored(True)
        clean.setTokenized(False)
        clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        self.field_clean = clean

        dirty = FieldType()
        dirty.setIndexed(True)
        dirty.setStored(False)
        dirty.setTokenized(True)
        dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.field_dirty = dirty
コード例 #34
0
ファイル: UpdateFiles.py プロジェクト: ltzone/2019Fall
    def indexDocs(self, root, writer):

        t1 = FieldType() #t1 is used in URL fields
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t2 = FieldType() #t2 is used to index contents
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType() #t3 is used to index titles
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)



        indextxt = open(self.filedir, 'r')


        while True:
            t = indextxt.readline()
            if (len(t) == 0):
                indextxt.close()
                return
            filename = t.strip()
#        for root, dirnames, filenames in os.walk(root):
#            for filename in filenames:
            print "updating", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                title = file.readline()
                print title
                page_URL = file.readline()
                while True:
                    imgsrc = file.readline()
                    if (imgsrc == 'EOF'):
                        file.close()
                        break
                    contents = file.readline()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("imgurl", imgsrc, t1))
                    doc.add(Field("url", page_URL, t1))
                    doc.add(Field("title",title, t3))
                    doc.add(Field("contents", contents, t2))
                    writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
コード例 #35
0
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        files = os.listdir(root)

        for filename in files:
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                total = file.read()
                totallist=total.split("\n")

                geshou=totallist[0]
                geming=totallist[1]
                zhuanji=totallist[2]
                imgurl=totallist[3]
                liupai=totallist[4]
                shijian=totallist[5]
                jianjie=totallist[6]
                geci=totallist[7]


                contents = geming+geshou+zhuanji+liupai+geci
                seg_result = jieba.cut(contents)
                contents = ' '.join(seg_result)

                doc = Document() 
                doc.add(Field("contents",contents,t2))
                doc.add(Field("geming", geming, t1))
                doc.add(Field("geshou", geshou, t1))
                doc.add(Field("zhuanji", zhuanji, t1))
                doc.add(Field("liupai",liupai,t1))
                doc.add(Field("geci",geci,t1))
                doc.add(Field("imgurl", imgurl, t1))
                doc.add(Field("shijian",shijian,t1))
                doc.add(Field("jianjie",jianjie,t1))
                    
                writer.addDocument(doc)
                file.close()
            except Exception, e:
                print "Failed in indexDocs:", e
コード例 #36
0
    def indexDocs(self, root, writer):

        t1 = FieldType() #t1 is used in path and URL fields
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t2 = FieldType() #t2 is used to index contents
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType() #t3 is used to index titles
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        indextxt = open(self.filedir, 'r')


        while True:
            t = indextxt.readline()
            if (len(t) == 0):
                indextxt.close()
                return
            t = t.split()
            filename = t[1]
            URL = t[0]
#        for root, dirnames, filenames in os.walk(root):
#            for filename in filenames:
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                title = file.readline()
                print title
                contents = unicode(file.read())
                file.close()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", path, t1))
                doc.add(Field("url", URL, t1))
                doc.add(Field("title",title, t3))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
コード例 #37
0
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                # if not filename.endswith('.txt'):
                # continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    pwd = os.path.join(os.getcwd() + '/' + root, filename)
                    file = open(path)
                    url = file.readline()
                    title = file.readline()
                    tag = file.readline()
                    imgurl = file.readline()
                    price = file.readline()
                    wellrate = file.readline()
                    comment = file.readline()

                    contents = unicode(file.read(), 'utf8')
                    file.close()
                    doc = Document()
                    doc.add(Field('url', url, t1))
                    doc.add(Field('title', title, t1))
                    doc.add(Field('imgurl', imgurl, t1))
                    doc.add(Field('price', price, t1))
                    doc.add(Field('wellrate', wellrate, t1))
                    doc.add(Field('comment', comment, t1))

                    if len(tag) > 2:
                        doc.add(Field('tag', tag, t2))
                    else:
                        doc.add(Field('tag', ' ', t2))
                    #doc.add(Field('comment', comment, t1))

                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
コード例 #38
0
    def indexDocs(self, dialog, root, writer):
        """Index the pages listed in the tab-separated file ``dialog``.

        Each line of ``dialog`` supplies a URL, a file name and a title in
        its first three tab-separated columns.  The named file under ``root``
        is read as UTF-8, parsed with BeautifulSoup and added to ``writer``.

        NOTE(review): in this excerpt the ``try:`` below has no matching
        ``except``/``finally`` clause and ``f`` is never closed -- the snippet
        appears to be truncated; confirm against the full source.
        """

        # t1: indexed as a single term and stored (name, path).
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # t2: indexed and tokenized, not stored (contents).
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        f = open(dialog)
        lines = f.readlines()
        for line in lines:
            # Columns: url, file name, title.  The title read here is
            # replaced below by the page's own <title> when there is content.
            info = line.split('\t')
            url = info[0]
            filename = info[1]
            title = info[2]
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()
                doc = Document()
                doc.add(Field('name', filename, t1))
                doc.add(Field('path', path, t1))
                doc.add(
                    Field('url', url, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    soup = BeautifulSoup(contents)
                    try:
                        title = soup.title.text
                        doc.add(
                            Field('title', title, Field.Store.YES,
                                  Field.Index.ANALYZED))
                    except Exception, e:
                        # Fall back to the literal title 'none' when the
                        # page title cannot be read or added.
                        doc.add(
                            Field('title', 'none', Field.Store.YES,
                                  Field.Index.ANALYZED))
                    # Concatenate all text nodes, then pass them through
                    # analysis() (defined elsewhere) before indexing.
                    contents = ''.join(soup.findAll(text=True))
                    doc.add(Field("contents", analysis(contents), t2))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
コード例 #39
0
ファイル: indexing.py プロジェクト: andrely/vg-pipeline
def index_article(writer, art_id, art_body):
    """Add one article to the index.

    The ID is stored and indexed as a single untokenized term; the body is
    stored and indexed as tokenized text with positions.

    :param writer: index writer; only its ``addDocument`` method is used
    :param art_id: article identifier, converted with ``str``
    :param art_body: article text
    """
    def build_type(tokenized, index_options):
        # Both field types are indexed and stored; they differ only in
        # tokenization and index options.
        field_type = FieldType()
        field_type.setIndexed(True)
        field_type.setStored(True)
        field_type.setTokenized(tokenized)
        field_type.setIndexOptions(index_options)
        return field_type

    id_type = build_type(False, FieldInfo.IndexOptions.DOCS_AND_FREQS)
    body_type = build_type(
        True, FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    article = Document()
    article.add(Field("art_id", str(art_id), id_type))
    article.add(Field("art_body", art_body, body_type))
    writer.addDocument(article)
コード例 #40
0
ファイル: test_bug1842.py プロジェクト: svn2github/pylucene
    def setUp(self):
        """Create the analyzer and write a single two-field document."""
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer()
        writer = self.getWriter(analyzer=self.analyzer)

        # Tokenized field type with positions and term vectors, frozen
        # before use.
        vector_type = FieldType()
        vector_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        vector_type.setTokenized(True)
        vector_type.setStoreTermVectors(True)
        vector_type.freeze()

        document = Document()
        document.add(Field("all", "blah blah blah Gesundheit", vector_type))
        document.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        writer.addDocument(document)
        writer.close()
コード例 #41
0
def lazyImport():
    """Start the JVM and import the Lucene names on first call only.

    Later calls return immediately because the module-level ``imported``
    flag is set at the end of the first call.
    """
    global imported
    if imported:
        return

    # getJVM() is called before any java/org.apache.lucene import below.
    from meresco.pylucene import getJVM
    getJVM()

    from java.nio.file import Paths
    from org.apache.lucene.document import Document, StringField, Field, FieldType
    from org.apache.lucene.search import IndexSearcher, TermQuery
    from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer

    # Field type for values that are stored but not indexed or tokenized.
    UNINDEXED_TYPE = FieldType()
    UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
    UNINDEXED_TYPE.setStored(True)
    UNINDEXED_TYPE.setTokenized(False)

    imported = True
    # Publish every local name bound above (the imports and UNINDEXED_TYPE)
    # as module globals; renaming a local here changes the exported name.
    globals().update(locals())
コード例 #42
0
ファイル: indexer.py プロジェクト: ouceduxzk/AI2-Kaggle
 def indexDocs(self, root, writer): 
      
     #Create a new FieldType with default properties. 
     t1 = FieldType() 
     t1.setIndexed(True) 
     t1.setStored(True) 
     t1.setTokenized(False)#True if this field's value should be analyzed by the Analyzer. 
     t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) 
      
     #Create a new FieldType with default properties. 
     t2 = FieldType() 
     t2.setIndexed(True) 
     t2.setStored(True) 
     t2.setTokenized(True)#True if this field's value should be analyzed by the Analyzer. 
     t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) 
      
     for root, dirnames, filenames in os.walk(root): 
         for filename in filenames: 
             if not filename.endswith('.txt'): 
                 continue 
             print 'adding', filename 
             try: 
                 path = os.path.join(root, filename) 
                 file = open(path) 
                 contents = file.read() 
                 file.close() 
                 doc = Document() 
                 doc.add(Field('name', filename, t1)) 
                 doc.add(Field('path', root, t1)) 
                 if len(contents) > 0: 
                     doc.add(Field('contents', contents, t2)) 
                     print 'length of content is %d'%(len(contents)) 
                 else: 
                     print 'warning: no content in %s' % filename 
                 writer.addDocument(doc) 
             except Exception, e: 
                 print 'Failed in indexDocs:', e 
コード例 #43
0
ファイル: indexer.py プロジェクト: zz-mars/simple-search
	def indexDocs(self,root,writer):
		t1 = FieldType()
		t1.setIndexed(True)
		t1.setStored(True)
		t1.setTokenized(True)
		t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

		t2 = FieldType()
		t2.setIndexed(True)
		t2.setStored(False)
		t2.setTokenized(True)
		t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

		for root, dirnames,filenames in os.walk(root):
			# traverse through the doc directory
			for filename in filenames:
				# only if this file ends with '.c'
				if not filename.endswith('.c'):
					continue
				try:
					# only add the filename and path for indexing
					path = os.path.join(root,filename)
					print "adding file : ",path
					file = open(path)
					contents = unicode(file.read(),'utf-8')
					file.close()
					doc = Document()
					doc.add(Field("name",filename,t1))
					doc.add(Field("path",root,t1))
				#	if len(contents) > 0:
				#		doc.add(Field("contents",contents,t2))
				#	else:
				#		print "warning: no content in ",filename
					writer.addDocument(doc)
				except Exception,e:
					print "failed in indexDocs:",e
コード例 #44
0
ファイル: WikiIndex.py プロジェクト: alvations/Wikicorpus
def index_wiki(wiki_xmlfile, index_directory_name):
    """Build a new Lucene index from a Wikicorpus XML file.

    Starts the JVM, opens index_directory_name with OpenMode.CREATE and adds
    one document per item yielded by wikicorpusxml(wiki_xmlfile). Each
    document gets three fields: "contents" (the text between the first '>'
    and the following '<'), "title" and "url" (the values of the title="..."
    and url="..." attributes). The writer is committed and closed at the end.
    """
    def _stored_type(tokenized):
        # Indexed + stored FieldType with positions; only tokenization varies.
        field_type = FieldType()
        field_type.setIndexed(True)
        field_type.setStored(True)
        field_type.setTokenized(tokenized)
        field_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return field_type

    lucene.initVM()

    # Index directory, analyzer and writer.
    lucene_version = Version.LUCENE_CURRENT
    index_dir = FSDirectory.open(File(index_directory_name))
    writer_config = IndexWriterConfig(lucene_version, StandardAnalyzer(lucene_version))
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_dir, writer_config)

    # Content and title are tokenized; the URL is kept as a single term.
    content_fieldtype = _stored_type(True)
    title_fieldtype = _stored_type(True)
    url_fieldtype = _stored_type(False)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        after_open_tag = xmldoc.partition('>')[2]
        after_title_attr = xmldoc.partition(' title="')[2]
        after_url_attr = xmldoc.partition(' url="')[2]

        content = after_open_tag.partition('<')[0].strip()
        title = after_title_attr.partition('"')[0].strip()
        url = after_url_attr.partition('"')[0].strip()

        doc = Document()
        for field_name, value, field_type in (
                ("contents", content, content_fieldtype),
                ("title", title, title_fieldtype),
                ("url", url, url_fieldtype)):
            doc.add(Field(field_name, value, field_type))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
コード例 #45
0
    def indexDocs(self, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)



        for folder in os.listdir(rawDir):
            for fileName in os.listdir(rawDir+'/'+folder):
                if not fileName.endswith('.json'):
                    continue


                print fileName
                
                subject_id = fileName.split('_')[0]
                
                # print rawDict
                print 'id:'+subject_id
                print 'folder:'+str(folder)

                rawPath = rawDir+'/' +folder +'/'+subject_id+'_res.json'
                adjPath = adjDir+'/' +folder +'/'+subject_id+'_adj.json'
                #tfidfPath = tf_idfDir + '/' + subject_id+'_tfidf.json'

                print adjPath

                rawFile = open(rawPath,'r')
                raw = rawFile.read()
                if raw =='':
                    with open(baseDir+'/'+'err_no_raw_content.txt','a') as err:
                        err.write(subject_id+'\n')
                # if subject_id =='6018943':
                #     rawFile.seek(0)

                rawFile.close()
                adjFile = open(adjPath,'r')
                adj = adjFile.read()

                adjFile.close()

                raw = getRidOfBOM(raw)
                adj = getRidOfBOM(adj)

                if raw != '':
                    rawDict = json.loads(raw)
                adjDict = json.loads(adj)

                rawAll = rawDict['summary'] +' '+ rawDict['user_tags'] + ' '+rawDict['comments']
                summary_adjs = adjDict['summary_adjs']
                comments_adjs = adjDict['comments_adjs']
                title = adjDict['title']
                rating_average = adjDict['rating_average']
                comments_count = adjDict['comments_count']

                doc = Document()
                doc.add(Field("folder",folder,t1))
                doc.add(Field("title", title, t1))
                doc.add(Field("subject_id", subject_id, t1))
                doc.add(IntField("comments_count", comments_count, Field.Store.YES))
                doc.add(FloatField("rating_average", rating_average, Field.Store.YES))


                if len(summary_adjs)>0:
                    print summary_adjs
                    exit()
                doc.add(Field("summary_adjs", summary_adjs, t2))

                #if len(comments_adjs)>0:
                doc.add(Field("comments_adjs", comments_adjs, t2))

                writer.addDocument(doc)
    def tweetIndexer(self, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        x = 0
        for i in range(0,500):
            if not os.path.isfile("json/tweets-" + str(i) + ".json"):
                break

            print "adding tweets-" + str(i) + ".json"
            tweets = open("json/tweets-" + str(i) + ".json", "r")

            for line in tweets.readlines():
                tweet = json.loads(line)
                if 'limit' in tweet:
                    continue
                try:
                    doc = Document()
                    doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1))
                    sname = tweet['user']['screen_name']
                    tid = str(tweet['id'])
                    text = tweet['text']
                    uname = tweet['user']['name']
                    created = tweet['created_at']
                    tstamp = tweet['timestamp_ms']
                    place = ""
                    if tweet['place']:
                        place = tweet['place']['full_name'] + ", " + tweet['place']['country']
                    lat = ""
                    lng = ""
                    titles = ""
                    urls = ""
                    exist = "false"

                    if tweet['coordinates']:
                        lat = str(tweet['coordinates']['coordinates'][1])
                        lng = str(tweet['coordinates']['coordinates'][0])
                    else:
                        lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2)
                        lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2)
                    
                    if len(tweet['entities']['urls']) != 0:
                        exist = "true"
                        for index in range(len(tweet['entities']['urls'])):
                            title = tweet['entities']['urls'][index]['url_title']
                            if title == None:
                                titles += ",-"
                            else:
                                title = title.encode('ascii','ignore')
                                titles += "," + str(title)
                            urls += " " + str(tweet['entities']['urls'][index]['expanded_url'])


                    searchable = text + " " + urls + " " + uname + " " + sname + " " + place
                    doc.add(Field("lookup", searchable, t2))
                    doc.add(Field("text", text, t2))
                    doc.add(Field("user_name", uname, t2)) 
                    doc.add(Field("screen_name", sname, t2))                    
                    doc.add(Field("tweet_id", tid, t2))
                    doc.add(Field("created_at", created, t2))
                    doc.add(Field("geo_lat", lat, t2))
                    doc.add(Field("geo_lng", lng, t2))
                    doc.add(Field("url_exist", exist, t2))
                    doc.add(Field("url_url", urls, t2))
                    doc.add(Field("url_title", titles, t2))
                    doc.add(Field("timestamp", tstamp, t2))
                    writer.addDocument(doc)
                    x += 1
                except Exception, e:
                    pass
            tweets.close()
コード例 #47
0
	def __init__(self):
		"""Create the index writer and the per-field FieldTypes.

		Directories, analyzers, open mode and the optional similarity are all
		read from settings.ADMINS_ENGINE. The writer analyzes the fields
		'name', 'parent', 'content' and 'id' with their own analyzers and
		falls back to the 'default' analyzer for everything else.
		"""
		engine = settings.ADMINS_ENGINE

		self.mDocumentDirectory = engine.mDocumentDirectory
		self.mIndexDirectory = engine.mIndexDirectory
		self.mAnalyzers = engine.getIndexingAnalyzers()

		# ---- Writer configuration ----
		# (field name, key into self.mAnalyzers); 'content' uses the default.
		perFieldAnalyzers = HashMap()
		for fieldName, analyzerKey in (
				('name', 'name'),
				('parent', 'parent'),
				('content', 'default'),
				('id', 'id')):
			perFieldAnalyzers.put(fieldName, self.mAnalyzers[analyzerKey])

		analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], perFieldAnalyzers)

		self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
		self.mWriterConfig.setOpenMode(engine.mOpenMode)
		if engine.mSimilarity != None:
			self.mWriterConfig.setSimilarity(engine.mSimilarity)

		self.mIndexWriter = IndexWriter(SimpleFSDirectory(File(self.mIndexDirectory)), self.mWriterConfig)

		# ---- FieldType preparation ----
		def storedType(tokenized, indexOptions):
			# Every field is indexed and stored; only these two settings vary.
			fieldType = FieldType()
			fieldType.setIndexed(True)
			fieldType.setStored(True)
			fieldType.setTokenized(tokenized)
			fieldType.setIndexOptions(indexOptions)
			return fieldType

		options = FieldInfo.IndexOptions
		self.mFieldTypes = {
			'name' 		: storedType(True, options.DOCS_ONLY),
			'parent'	: storedType(True, options.DOCS_ONLY),
			'content'	: storedType(True, options.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS),
			'id'		: storedType(False, options.DOCS_ONLY)
		}

		self.mLog = ""
コード例 #48
0
ファイル: index.py プロジェクト: erfannoury/pyLInSea
def indexCranFull(path, writer):
    """
    Read the cran.1400 file at `path` and add one document per record to `writer`.

    These fields are used when storing documents (the field names come from
    the module-level title_field, author_field, content_field and docid_field):
        title: title of the document (the lines following .T)
        author: author of the document (the line following .A)
        source: where this article has been published, not yet used (the line following .B)
        content: body text of the article (the lines following .W)
        docid: the number on the record's own .I line

    cran documents are listed in this order:
        .I # -> record number
        .T -> starting on the next line, multi-line title
        .A -> on the next line, multiple authors separated by 'and'
        .B -> on the next line, ignored
        .W -> starting on the next line, multi-line body (content)
    """
    # Title field type
    tft = FieldType()
    tft.setIndexed(True)
    tft.setStored(True)
    tft.setTokenized(TokenizeFields)
    tft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) #only index the document and frequency data

    # Author field type
    aft = FieldType()
    aft.setIndexed(True)
    aft.setStored(True)
    aft.setTokenized(TokenizeFields)
    aft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) #index the document, term frequency and position data

    # Content field type
    cft = FieldType()
    cft.setIndexed(True)
    cft.setStored(True)
    cft.setTokenized(True)
    cft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    docid = 0
    # 'with' closes the input file, also when parsing or indexing fails.
    with open(path) as txt:
        while True: # each pass consumes one .I line or one whole record
            line = txt.readline()
            if line == '':
                break

            if line.startswith('.I'):
                docid = int(line.split(' ')[1].strip())
                continue

            if not line.startswith('.T'):
                continue

            title = ''
            while True:
                line = txt.readline()
                # Stop at EOF as well: readline() keeps returning '' there,
                # so a truncated record used to loop forever.
                if line == '' or line.startswith('.A'):
                    break
                title = ' '.join([title, line]).strip()

            authors = txt.readline().strip()

            #.B, its corresponding line and .W
            txt.readline()
            txt.readline()
            txt.readline()

            body = ''
            # The body ends at the next record's .I line. That id belongs to
            # the NEXT document, so it is held back until this one is added;
            # assigning it to docid right away stored every document under
            # its successor's id.
            next_docid = None
            while True:
                line = txt.readline()
                if line == '':
                    break
                if line.startswith('.I'):
                    next_docid = int(line.split(' ')[1].strip())
                    break
                body = ' '.join([body, line]).strip()

            doc = Document()
            doc.add(Field(title_field, title, tft))
            doc.add(Field(author_field, authors, aft))
            doc.add(Field(content_field, body, cft))
            doc.add(IntField(docid_field, docid, Field.Store.YES))
            writer.addDocument(doc)

            if next_docid is not None:
                docid = next_docid
コード例 #49
0
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

# Script: index two short texts into an in-memory directory with term vectors
# enabled, then read each document's term vector back.
# NOTE(review): the JVM is only started when this file is run as a script,
# but every statement below also runs when the module is imported.
if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# In-memory index. The analyzer is a StandardAnalyzer wrapped in a
# LimitTokenCountAnalyzer with a limit of 100 (presumably the maximum number
# of tokens analyzed per field; confirm against the Lucene documentation).
directory = RAMDirectory()
iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
iwriter = IndexWriter(directory, iconfig)

# Field type: stored and tokenized, with term vectors that also record
# offsets and positions.
ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

# One document per sample text, all in the field "fieldname".
ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

# Commit and close the writer before opening a reader on the same directory.
iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)

# The loop index is passed to getTermVector() as the document number.
for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    # Term enumerator for this document's vector; it is not consumed within
    # the visible part of this snippet.
    termsEnum = tv.iterator()
コード例 #50
0
    def indexDocs(self, root, writer):

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                print "adding", filename
                doc_parser = HTMLDocumentParser()
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'iso-8859-1')
                    doc_parser.feed(contents)
                    contents = doc_parser.contents
                    html_doc = HTMLDocument(contents)

                    flag = False
                    if flag:
                        print '=============='
                        print 'Title: ' + html_doc.title
                        print 'Description: ' + html_doc.description
                        print 'Month: ' + html_doc.month
                        print 'Year: ' + html_doc.year
                        print 'Authors: ' + str(html_doc.authors)
                        print 'Keywords: ' + str(html_doc.keywords)
                        print 'Timestamp: ' + str(html_doc.timestamp)
                        print ' '

                    file.close()

                    doc = Document()

                    field_filename = FieldType()
                    field_filename.setIndexed(True)
                    field_filename.setStored(True)
                    field_filename.setTokenized(False)
                    field_filename.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)        
                    doc.add(Field("filename", filename.replace('.html', ''), field_filename))

                    field_path = FieldType()
                    field_path.setIndexed(True)
                    field_path.setStored(True)
                    field_path.setTokenized(True)
                    field_path.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                    doc.add(Field("path", root, field_path))
                    
                    field_title = FieldType()
                    field_title.setIndexed(True)
                    field_title.setStored(True)
                    field_title.setTokenized(True)
                    field_title.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)        
                    doc.add(Field("title", html_doc.title, field_title))

                    field_description = FieldType()
                    if html_doc.has_description():
                        field_description.setIndexed(True)
                    else:
                        field_description.setIndexed(True)
                    field_description.setStored(True)
                    field_description.setTokenized(True)
                    field_description.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)        
                    doc.add(Field("description", html_doc.description, field_description))

                    field_month = FieldType()
                    field_month.setIndexed(True)
                    field_month.setStored(True)
                    field_month.setTokenized(False)
                    field_month.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)        
                    doc.add(Field("month", html_doc.month, field_month))

                    field_year = FieldType()
                    field_year.setIndexed(True)
                    field_year.setStored(True)
                    field_year.setTokenized(False)
                    field_year.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)        
                    doc.add(Field("year", html_doc.year, field_year))

                    if html_doc.has_authors():
                        field_author = FieldType()
                        field_author.setIndexed(True)
                        field_author.setStored(True)
                        field_author.setTokenized(True)
                        field_author.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) 
                        for author in html_doc.authors:
                            doc.add(Field("author", author, field_author))

                    if html_doc.has_keywords():
                        field_keyword = FieldType()
                        field_keyword.setIndexed(True)
                        field_keyword.setStored(True)
                        field_keyword.setTokenized(True)
                        field_keyword.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) 
                        for keyword in html_doc.keywords:
                            doc.add(Field("keyword", keyword, field_keyword))

                    field_timestamp = FieldType()
                    field_timestamp.setIndexed(False)
                    field_timestamp.setStored(True)
                    field_timestamp.setTokenized(False)
                    field_timestamp.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)        
                    doc.add(Field("timestamp", html_doc.timestamp, field_timestamp))

                    if len(contents) > 0:
                        field_source = FieldType()
                        field_source.setIndexed(True)
                        field_source.setStored(True)
                        field_source.setTokenized(True)
                        field_source.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) 
                        doc.add(Field("contents", contents, field_source))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
コード例 #51
0
    def indexTable(self, writer):

        #connection 
        con = None

        #define the index of all the fields
        #---------step 2----------
        con = mdb.connect('localhost','root','testgce','moviedata')

        #t_num = FieldType.NumericType it is wrong!!
        t_num = FieldType()
        t_num.setStored(False)

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        with con:
            # Careful with codecs
            con.set_character_set('utf8')

            cur = con.cursor()
            # Aagin the codecs
            cur.execute('SET NAMES utf8;')
            cur.execute('SET CHARACTER SET utf8;')
            cur.execute('SET character_set_connection=utf8;')
            
            #------step 3------
            cur.execute("SELECT * FROM movie_items")

            numrows = int(cur.rowcount)
            print 'numrows:',numrows
            for i in range(numrows):
                row = cur.fetchone()

                #------step 4------
                summary = row[SUMMARY]  
                subject_id = row[SUBJECT_ID]


                print 'id'+subject_id
                #print 'summary'+summary+'end'

                doc = Document()
                #fields which should not be analyzed
                doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.NO))
                doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.NO))
                doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.NO))
                #doc.add(FloatField("year", float(row[YEAR]), Field.Store.NO))
                doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.NO))
                doc.add(IntField("subject_id", int(subject_id), Field.Store.YES))
                doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.NO))
                doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.NO))
                doc.add(Field("image_small", row[IMAGE_SMALL], t1))

                #fields which should be analyzed with WhitespaceAnalyzer
                doc.add(Field("countries", row[COUNTRIES].replace(delim,' '), t3))
                doc.add(Field("casts",     row[CASTS].replace(delim,' '),     t3))
                doc.add(Field("genres",    row[GENRES].replace(delim,' '),    t3))
                doc.add(Field("subtype",   row[SUBTYPE].replace(delim,' '),   t2))
                doc.add(Field("directors", row[DIRECTORS].replace(delim,' '), t3))

                user_tags_str = ''
                others_like_str = ''
                
                # print 'user_tags'+row[USER_TAGS]
                # print 'others_like'+row[OTHERS_LIKE]
                
                if row[USER_TAGS]!='':
                    for tag_pair in row[USER_TAGS].split(delim):
                        if tag_pair!='':#字符串的最后一个字符是:,这样split之后最后一个元素是空字符
                            user_tags_str = user_tags_str +' '+tag_pair.split(delim_uo)[0]
                if row[OTHERS_LIKE]!='':
                    for like_pair in row[OTHERS_LIKE].split(delim):
                        if like_pair!='':
                            others_like_str = others_like_str +' '+like_pair.split(delim_uo)[1]

                # print user_tags_str
                # print others_like_str


                doc.add(Field("user_tags", user_tags_str, t3))
                doc.add(Field("others_like", others_like_str, t3))

                #fields which should be analyzed with good analyzer
                doc.add(Field("title", row[TITLE], t3))                
                doc.add(Field("original_title", row[ORIGINAL_TITLE], t2))
                doc.add(Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t2))
                doc.add(Field("aka", row[AKA], t2))

                if len(summary) > 0:
                    print subject_id +'--->'+':\n    '+ row[TITLE]
                    try:
                        summary_unicoded = unicode(summary, 'utf-8') #test the encoding 
                    except Exception,e:
                        print "Decode Failed: ", e
                    doc.add(Field('summary', summary, t2))
                else:
                    print "warning:\n" + subject_id +'---> No content!'
                writer.addDocument(doc)
コード例 #52
0
	def reindex(self):
		''' Re-indexes the entire database into Index file'''
		start = time.time()

		# get all posts
		posts = self._tuples_to_dict(self._fetch_all_questions(), self._posts_fields)
		if not posts:
			raise Exception("FATAL Error: Could not fetch posts from Database")

		# open indexer
		# lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		# print 'lucene', lucene.VERSION

		store = SimpleFSDirectory(File(self.index_dir))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
		writer = IndexWriter(store, config)

		indexedField = FieldType()
		indexedField.setIndexed(True)
		indexedField.setStored(True)
		indexedField.setTokenized(True)
		indexedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

		storedField = FieldType()
		storedField.setIndexed(False)
		storedField.setStored(True)
		storedField.setTokenized(False)
		storedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

		fieldTypes = {
						'type'		: storedField,
						'id'		: storedField,
						'title'		: indexedField,
						'question'	: indexedField,
						'answer'	: indexedField,
						# 'comment'	: indexedField,
						'tag'		: indexedField,
						'extra'		: indexedField,
		}

		# get their comments
		num_docs = 0
		for post in posts:
			if self.status_mode: print "\r {0:.2f} %complete".format(((num_docs/142627.0)*100)),
			if self.debug : print "\n","*"*20,"\nIndexing post: ", post['id'], "from ", post['extra']
			if self.debug and self.verbose_values: print post
			answers = self._tuples_to_dict(self._fetch_all_answers(post['id'], post['extra']), self._answer_fields)


			# add comment field
			for answer in answers:
				num_docs += 1
				if self.debug: print "\n","+"*10, "\nMaking new Document"
				doc = Document()
				if self.debug: print "Adding doc type"
				doc.add(Field("type", self.doctype, fieldTypes['type']))
				
				# make fields
				if self.debug: print "Adding post fields"
				for i in xrange(len(self._posts_fields)):
					f = Field(self._posts_fields[i], self._cleanup_tag(post[self._posts_fields[i]]), fieldTypes[self._posts_fields[i]])
					f.setBoost(self._fields_boost[self._posts_fields[i]])
					doc.add(f)


				if self.status_mode: print "\t Indexing answer: ", answer['answer_id']
				if self.debug and self.verbose_values: print answer
				# answered_doc = copy.deepcopy(doc)
				# make comment field
				f = Field("answer", self._cleanup_tag(answer['answer']), fieldTypes['answer'])
				f.setBoost(self._fields_boost['answer'])
				doc.add(f)
				# calculate paths
				# commented_doc = copy.deepcopy(answered_doc)
				# comments = self._comments_to_comment_string(self._tuples_to_dict(self._fetch_all_comments(answer['id']), self._comment_fields))

				# if self.debug: print "\t\tAdding comments: ", comments
				# commented_doc.add(Field("comment", self._cleanup_tag(comments), fieldTypes['comment']))

				# write index
				if self.debug: print "\tAdding document {doc_id} to index".format(doc_id=post['id'])
				writer.addDocument(doc)

				# del answered_doc
				# del commented_doc

			if self.debug: print "Commiting document to index"
			writer.commit()

		# close index
		if self.status_mode: print "Closing index write"
		writer.close()
		end = time.time() - start

		if self.status_mode: print "\n","-"*20, \
			"\nTotal time spent in indexing: ", end, "seconds" \
			"\nIndexed {num_docs} documents".format(num_docs=num_docs)
コード例 #53
0
    def indexTable(self, writer):

        #connection 
        con = None

        #define the index of all the fields
        #---------step 2:connect to mysql----------
        con = mdb.connect('localhost','root','testgce','douban_movie_v3')

        #t_num = FieldType.NumericType it is wrong!!
        t_num = FieldType()
        t_num.setStored(False)

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        maxDict = utils.maxDict
        #加权数值范围
        base = DOC_BOOST_RANGE[0]
        upper = DOC_BOOST_RANGE[1]

        with con:
            # Careful with codecs
            con.set_character_set('utf8')

            cur = con.cursor()
            # Aagin the codecs
            cur.execute('SET NAMES utf8;')
            cur.execute('SET CHARACTER SET utf8;')
            cur.execute('SET character_set_connection=utf8;')
            
            #------step 3: choose the right table------
            cur.execute("SELECT * FROM movie_items")

            numrows = int(cur.rowcount)
            print 'numrows:',numrows
            for i in range(numrows):
                print
                row = cur.fetchone()

                #------step 4:Index your field------
                summary = row[SUMMARY]  
                subject_id = row[SUBJECT_ID]


                print 'id'+subject_id
                year = utils.formatYear(row[YEAR])
                try:
                    date = DateTools.stringToDate(year.replace('-',' '))
                    wtfFile = open('wtf.txt','a')
                    dateStr  = DateTools.dateToString(date,DateTools.Resolution.DAY)
                except:
                    wtfFile.write(year+'\n')

                        

                doc = Document()

                #boosting
                boostProb = utils.calcBoostProb(row,maxDict,dateStr)
                boost = base + boostProb*(upper-base)

                doc.add(FloatField("boost",boost,Field.Store.YES))
                doc.add(StringField("year",dateStr,Field.Store.YES))
                print 'dateStr:'+dateStr
                #A text field is a sequence of terms that has been tokenized while a string field is a single term (although it can also be multivalued.)

                do_count = row[DO_COUNT] if row[DO_COUNT] != None else 0
                wish_count = row[COLLECT_COUNT] if row[WISH_COUNT] != None else 0

                #fields which should not be analyzed
                doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.YES))
                doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES))
                doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES))
                #doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost))
                doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES))
                doc.add(IntField("do_count", int(do_count), Field.Store.YES))
                doc.add(IntField("wish_count", int(wish_count), Field.Store.YES))
                doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES))
                doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES))
                doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES))
                doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES))

                #fields which should be analyzed with WhitespaceAnalyzer
                #attention!!! dont use a long sentence like :
                #doc.add(Field("genres",    row[GENRES].replace(delim,' '),    t3).setBoost(boost))
                #or you'll get a null pointer error
                f = Field("countries", row[COUNTRIES].replace(delim,' '), t3)
                f.setBoost(boost)
                doc.add(f)

                #process casts
                raw_casts = row[CASTS].replace(delim,' ')
                f = Field("raw_casts", raw_casts , t1)
                f.setBoost(boost)
                doc.add(f)

                #将英文人名中的 ·
                raw_casts = raw_casts.replace('·',' ')
                
                if len(raw_casts.split(' '))<CASTS_LEN:
                    #平局人名长度是4
                    casts = raw_casts + ' ¥¥¥¥'*(CASTS_LEN-len(raw_casts.split(' ')))
                f = Field("casts", casts , t3)
                f.setBoost(boost)
                doc.add(f)

                #process directors
                raw_directors = row[DIRECTORS].replace(delim,' ')
                f = Field("raw_directors",raw_directors, t1)
                f.setBoost(boost)
                doc.add(f)

                #将英文人名中的 · 替换
                raw_directors = raw_directors.replace('·',' ')

                if len(raw_directors.split(' '))<DIRECTORS_LEN:
                    #平局人名长度是4
                    directors = raw_directors + ' ¥¥¥¥'*(DIRECTORS_LEN-len(raw_directors.split(' ')))
                f = Field("directors", directors, t3)
                f.setBoost(boost)
                doc.add(f)

                Field("genres",    row[GENRES].replace(delim,' '),    t3)
                f.setBoost(boost)
                doc.add(f)

                Field("subtype",   row[SUBTYPE].replace(delim,' '),   t3)
                f.setBoost(boost)
                doc.add(f)

                #it is wrong cause indexable field has no method setBoost
                # fieldList = doc.getFields()  # is not a python 'list' , but a 'List' which is unindexable                
                # for eachField in fieldList:
                #     eachField.setBoost(boost)


                #user_tags 原始字符串要存,reRank要用:
                doc.add(StringField("raw_user_tags",row[USER_TAGS],Field.Store.YES))
                doc.add(StringField("raw_others_like",row[OTHERS_LIKE],Field.Store.YES))
                

                user_tags_str = ''
                others_like_str = ''
                tags_len = 0
                

                if row[USER_TAGS]!='':
                    user_tags_list = row[USER_TAGS].split(delim) 
                    for tag_pair in user_tags_list:
                        if tag_pair!='':#字符串的最后一个字符是¥,这样split之后最后一个元素是空字符
                            #print 'tag_pair'+tag_pair+'hhe'
                            tag_name = tag_pair.split(delim_uo)[0]+' ' # dont forget this space !!
                            tag_num = tag_pair.split(delim_uo)[1]
                            tag_num_processed = int(int(tag_num)/TAG_SPAN)+1 #最小为1
                            #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
                            user_tags_str = user_tags_str +' '+ tag_name * tag_num_processed
                            tags_len = tags_len + tag_num_processed #最后得到总共词的个数


                if tags_len<TAGS_AVER_LEN:
                    #填充tags,目测3是平均长度,所以使用 ¥¥¥
                    user_tags_str = user_tags_str +' ¥¥¥'*(TAGS_AVER_LEN - tags_len)
                #


                if row[OTHERS_LIKE]!='':
                    for like_pair in row[OTHERS_LIKE].split(delim):
                        if like_pair!='':
                            others_like_str = others_like_str +' '+like_pair.split(delim_uo)[1]


                #start process adjs
                if row[ADJS] != None:
                    raw_adjs = row[ADJS][:-1]

                    adjs_str = ''
                    adjs_len = 0
                    if row[ADJS] != '' and row[ADJS] != '\n':
                        #'重要=4.0,特殊=4.0'
                        adjs_str = row[ADJS]
                        adjs_list = adjs_str.split(',')
                        for adj_pair in adjs_list:
                            #print 'adj_pair:'+adj_pair+'hhe'
                            adj_name = adj_pair.split('=')[0]
                            adj_num = adj_pair.split('=')[1]

                            #去换行符,转换int
                            if adj_num[-1] == '\n':
                                adj_num = adj_num[0:-1]
                            adj_num = int(float(adj_num))

                            add_adj=''
                            # #同义词
                            # adj_name_bro = searchDictValue(adjMap,adj_name)
                            # if adj_name_bro == -1: #表示没有结果,即未找到近义词,不添加
                            #     add_adj = ''
                            # else:
                            #     add_adj = (adj_name_bro+' ')*adj_num
                            #     raw_adjs = raw_adjs + ',' + adj_name_bro+'='+str(adj_num)
                                
                            adjs_str = adjs_str + ' ' + (adj_name+' ') * adj_num +add_adj
                            adjs_len = adjs_len + adj_num #最后得到总共tags的个数

                    #print raw_adjs
                    doc.add(StringField("raw_adjs",raw_adjs,Field.Store.YES))

                    if adjs_len<ADJS_AVER_LEN:
                        #填充 adjs_str,目测2是平均长度,所以使用 "¥¥"
                        adjs_str = adjs_str +' ¥¥'*(ADJS_AVER_LEN - adjs_len)

                    f = Field("adjs", adjs_str, t3)
                    f.setBoost(boost)
                    doc.add(f)

                f = Field("user_tags", user_tags_str, t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("others_like", others_like_str, t3)
                f.setBoost(boost)
                doc.add(f)



                #fields which should be analyzed with good analyzer
                f = Field("title", row[TITLE], t3)                
                f.setBoost(boost)
                doc.add(f)

                f = Field("original_title", row[ORIGINAL_TITLE], t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t3)
                f.setBoost(boost)
                doc.add(f)

                f = Field("aka", row[AKA], t2)
                f.setBoost(boost)
                doc.add(f)

                if len(summary) > 0:
                    print subject_id +'--->'+':\n    '+ row[TITLE]
                    try:
                        summary_unicoded = unicode(summary, 'utf-8') #test the encoding 
                    except Exception,e:
                        print "Decode Failed: ", e
                    f = Field('summary', summary, t2)
                    f.setBoost(boost)
                    doc.add(f)
                else:
                    print "warning:\n" + subject_id +'---> No content!'
                print 'boosting:' + str(boost)

                #for debug
                if boost>upper:
                    print boostProb
                    print maxDict
                    
                    exit(0)

                writer.addDocument(doc)
コード例 #54
0
ファイル: index.py プロジェクト: wasw100/jekyll-search
def indexDocs(root, writer):
    """Index every Markdown post found under *root* into *writer*.

    The directory tree is walked recursively; only files whose name ends in
    ``.md`` are considered. For each post one Lucene ``Document`` is added
    with these fields:

    * ``date``    - stored only
    * ``name``    - indexed (un-tokenized) and stored
    * ``title``   - indexed (un-tokenized) and stored
    * ``content`` - indexed (tokenized), not stored
    * ``summary`` - stored only; the first 200 characters of the content

    A failure while processing one file is reported on stdout and the walk
    continues with the next file. This function neither commits nor closes
    *writer*.

    :param root: top-level directory to scan for ``.md`` files.
    :param writer: index writer that receives one document per post.
    :return: None
    """
    # Indexed and stored, kept as a single un-tokenized term.
    indexed_stored = FieldType()
    indexed_stored.setIndexed(True)
    indexed_stored.setStored(True)
    indexed_stored.setTokenized(False)
    indexed_stored.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # Indexed (tokenized, with positions) but not stored.
    indexed_only = FieldType()
    indexed_only.setIndexed(True)
    indexed_only.setStored(False)
    indexed_only.setTokenized(True)
    indexed_only.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Stored only, never searched.
    stored_only = FieldType()
    stored_only.setIndexed(False)
    stored_only.setStored(True)
    stored_only.setTokenized(False)
    stored_only.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # ``dirpath`` is deliberately not called ``root`` so that the caller's
    # argument is not clobbered by the loop variable.
    for dirpath, dirnames, filenames in os.walk(root):
        print(filenames)
        for filename in filenames:
            if not filename.endswith('.md'):
                continue
            print("adding %s" % filename)
            try:
                path = os.path.join(dirpath, filename)
                # The context manager closes the handle even when reading or
                # decoding fails; decoding explicitly yields unicode text.
                with open(path, 'rb') as handle:
                    contents = handle.read().decode('utf-8')

                date, name = get_date_name(filename)
                title, content = get_post_title_content(contents)
                summary = content[:200] if content else ''

                print("%s %s %s" % (date, name, title))

                doc = Document()
                doc.add(Field('date', date, stored_only))
                doc.add(Field('name', name, indexed_stored))
                doc.add(Field('title', title, indexed_stored))
                doc.add(Field('content', content, indexed_only))
                doc.add(Field('summary', summary, stored_only))
                writer.addDocument(doc)
            except Exception as e:
                # Best effort: one unreadable or malformed post must not
                # abort indexing of the remaining files.
                print("Failed in indexDocs: %s" % e)