def index_docs(self, train_set, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for ii in train_set:
        doc = Document()
        doc.add(Field("answer", ii['Answer'], t1))
        doc.add(Field("qid", ii['Question ID'], t1))
        doc.add(Field("category", ii['category'], t1))
        doc.add(Field("position", ii['Sentence Position'], t1))
        doc.add(Field("question", ii['Question Text'], t2))
        doc.add(Field("wiki_plain", self.wiki_reader.get_text(ii['Answer']), t2))
        writer.addDocument(doc)

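This index_docs method (and the similar indexers below) expects an already-configured IndexWriter. A minimal sketch of constructing one follows, assuming a Lucene 4.x-era PyLucene build to match the setIndexed()/FieldInfo.IndexOptions calls above; the index path and the call site are made-up placeholders.

# minimal sketch (assumptions: index path, Lucene 4.x-era API): build the writer
# that index_docs(train_set, writer) expects
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

lucene.initVM()
store = SimpleFSDirectory(File("qb.index"))          # hypothetical index directory
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
writer = IndexWriter(store, IndexWriterConfig(Version.LUCENE_CURRENT, analyzer))
# indexer.index_docs(train_set, writer)              # hypothetical call site
writer.commit()
writer.close()
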
def index_docs(self, tweets, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            contents = []
            for word in tweet[1].text.split():
                if word.startswith("http://") or word.startswith("https://"):
                    continue
                contents.append(word)
            contents = " ".join(contents)
            if len(contents) == 0:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in index_docs:", e

def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setStoreTermVectors(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPositions(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    file_path = root + 'r52-train-all-terms.txt'
    fd = open(file_path)
    contents = fd.readlines()
    fd.close()
    contents_list = [x.strip() for x in contents]

    for i in xrange(len(contents_list)):
        try:
            [topic, content] = contents_list[i].split('\t')
            doc = Document()
            doc.add(Field("id", str(i), t1))
            doc.add(Field("topic", topic, t1))
            doc.add(Field("contents", content, t2))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e

def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.json'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = json.load(file)
                file.close()

                doc = Document()
                doc.add(Field("title", contents['title'], t2))
                doc.add(Field("path", root, t1))
                doc.add(Field("playStoreURL", contents['playStoreURL'], t1))
                doc.add(Field("creator", contents['creator'], t1))
                extendedInfo = contents['extendedInfo']
                if len(extendedInfo['description']) > 0:
                    doc.add(Field("description", extendedInfo['description'], t2))
                else:
                    print "warning: no description in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e

def tweetIndexer(self, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    x = 0
    for i in range(0, 500):
        if not os.path.isfile("json/tweets-" + str(i) + ".json"):
            break

        print "adding tweets-" + str(i) + ".json"
        tweets = open("json/tweets-" + str(i) + ".json", "r")

        for line in tweets.readlines():
            tweet = json.loads(line)
            if 'limit' in tweet:
                continue
            try:
                doc = Document()
                doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1))
                sname = tweet['user']['screen_name']
                tid = str(tweet['id'])
                text = tweet['text']
                uname = tweet['user']['name']
                created = tweet['created_at']
                tstamp = tweet['timestamp_ms']
                place = ""
                if tweet['place']:
                    place = tweet['place']['full_name'] + ", " + tweet['place']['country']
                lat = ""
                lng = ""
                titles = ""
                urls = ""
                exist = "false"

                if tweet['coordinates']:
                    lat = str(tweet['coordinates']['coordinates'][1])
                    lng = str(tweet['coordinates']['coordinates'][0])
                else:
                    lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2)
                    lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2)

                if len(tweet['entities']['urls']) != 0:
                    exist = "true"
                    for index in range(len(tweet['entities']['urls'])):
                        title = tweet['entities']['urls'][index]['url_title']
                        if title == None:
                            titles += ",-"
                        else:
                            title = title.encode('ascii', 'ignore')
                            titles += "," + str(title)
                        urls += " " + str(tweet['entities']['urls'][index]['expanded_url'])

                searchable = text + " " + urls + " " + uname + " " + sname + " " + place
                doc.add(Field("lookup", searchable, t2))
                doc.add(Field("text", text, t2))
                doc.add(Field("user_name", uname, t2))
                doc.add(Field("screen_name", sname, t2))
                doc.add(Field("tweet_id", tid, t2))
                doc.add(Field("created_at", created, t2))
                doc.add(Field("geo_lat", lat, t2))
                doc.add(Field("geo_lng", lng, t2))
                doc.add(Field("url_exist", exist, t2))
                doc.add(Field("url_url", urls, t2))
                doc.add(Field("url_title", titles, t2))
                doc.add(Field("timestamp", tstamp, t2))
                writer.addDocument(doc)
                x += 1
            except Exception, e:
                pass
        tweets.close()

import lucene

from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    directory = RAMDirectory()
    iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
    iwriter = IndexWriter(directory, iconfig)

    ft = FieldType()
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    ts = [
        "this bernhard is the text to be index text",
        "this claudia is the text to be indexed"
    ]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t, ft))
        iwriter.addDocument(doc)
    iwriter.commit()
    iwriter.close()

    ireader = DirectoryReader.open(directory)

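The script stops after opening the DirectoryReader; a hedged sketch of what a follow-up search over the "fieldname" field could look like is below (the query term and the TermQuery approach are illustrative assumptions, meant to sit at the end of the __main__ block).

# hedged follow-up (not in the original script): run a simple TermQuery against
# the "fieldname" field of the two documents indexed above
from org.apache.lucene.index import Term
from org.apache.lucene.search import IndexSearcher, TermQuery

searcher = IndexSearcher(ireader)
hits = searcher.search(TermQuery(Term("fieldname", "claudia")), 10)
for hit in hits.scoreDocs:
    print(searcher.doc(hit.doc).get("fieldname"))
ireader.close()
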
import lucene

from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    directory = RAMDirectory()
    iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
    iwriter = IndexWriter(directory, iconfig)

    ft = FieldType()
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    ts = ["this bernhard is the text to be index text",
          "this claudia is the text to be indexed"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t, ft))
        iwriter.addDocument(doc)
    iwriter.commit()
    iwriter.close()

    ireader = DirectoryReader.open(directory)

    for doc in xrange(0, len(ts)):
        # loop body reconstructed as an assumption (the original excerpt ends at the
        # bare for statement): dump the terms stored in each document's term vector
        terms = ireader.getTermVector(doc, "fieldname")
        termsEnum = terms.iterator()
        term = termsEnum.next()
        while term is not None:
            print term.utf8ToString()
            term = termsEnum.next()

def BuildSearchEngine(start, maxPages, domain, first):
    # only initiate the VM if this is the first call
    if first == True:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        print ('lucene'), lucene.VERSION

    if not os.path.exists("IndexFiles.index"):
        os.mkdir("IndexFiles.index")
    store = SimpleFSDirectory(Paths.get("IndexFiles.index"))
    config = IndexWriterConfig(StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET))

    # if this is the first call, create a new index; otherwise append new pages to the old index
    if first == True:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(store, config)

    # configure settings for pages being saved
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setStoreTermVectors(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPositions(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    pagesToVisit = [start]
    hashtable = dict()
    hashtable[start] = 1
    numberVisited = 0

    rp = robotparser.RobotFileParser()
    robotFileLocation = "http://www." + domain + "/robots.txt"
    rp.set_url(robotFileLocation)
    rp.read()

    # The main loop. Create a LinkParser and get all the links on the page.
    while numberVisited < maxPages and pagesToVisit != []:
        numberVisited = numberVisited + 1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links, hashtable = parser.getLinks(url, domain, hashtable, rp)
            # Add the pages that we visited to the end of our collection
            # of pages to visit:
            print(" **Success!**")

            path = "files/a.html"
            urllib.urlretrieve(url, path)

            file = open("files/a.html")
            contents = removeTag(file)
            file.close()
            file = open("files/a.html", "w")
            file.write(contents)
            file.close()
            file = open("files/a.html")
            contents = file.read()
            file.close()

            doc = Document()
            doc.add(Field("name", "a.html", t1))
            doc.add(Field("path", "files", t1))
            # index the url
            doc.add(Field("url", url, t1))
            if len(contents) > 0:
                doc.add(Field("contents", contents.decode("utf-8").replace(u"\u2019", "'").replace(u"\u2018", "'").replace(u"\ufe0f", "'").replace(u"\u20e3", "'"), t2))
            else:
                print("warning: no content in %s" % url)
            writer.addDocument(doc)
            pagesToVisit = pagesToVisit + links
        except Exception, e:
            print Exception, ":", e
            print(" **Failed!**")

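A hedged sketch of how BuildSearchEngine might be invoked; the start URL, page budget, and domain are placeholder values, not taken from the original code.

# hypothetical invocation of the crawler above; the arguments are placeholders
# (a writer.commit()/writer.close() would normally follow the crawl loop inside
# BuildSearchEngine, but that part is not shown in the excerpt)
BuildSearchEngine("http://www.example.com/index.html", 25, "example.com", True)
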
class IndexFiles:

    def __init__(self, path, analyzer):
        self.path = path
        self._analyzer = analyzer
        self.errors = []
        self._initialize()

    def index(self, csvs_path):
        all_csvs = [x for x in os.listdir(csvs_path) if x.endswith('csv')]
        for i, csv_file in enumerate(all_csvs, 1):
            print("\nProcessing CSV #{}".format(i), flush=True)
            patents = self._read_csv(csvs_path + "/" + csv_file)
            if patents is None:
                continue
            print("\rProcessed {}/{} patents in file".format(0, len(patents)), end='', flush=True)
            for j, patent in enumerate(patents, 1):
                pid, date, title, author, icn, org, acn, abstract, description, purpose, mechanics, uid = patent
                try:
                    doc = Document()
                    doc.add(Field('id', pid, self._ft1))
                    doc.add(Field('date', date, self._ft1))
                    doc.add(Field('title', title, self._ft2))
                    doc.add(Field('author', author, self._ft1))
                    doc.add(Field('icn', icn, self._ft1))
                    doc.add(Field('organization', org, self._ft1))
                    doc.add(Field('acn', acn, self._ft1))
                    doc.add(Field('abstract', abstract, self._ft2))
                    doc.add(Field('description', description, self._ft2))
                    doc.add(Field('purpose', purpose, self._ft2))
                    doc.add(Field('mechanics', mechanics, self._ft2))
                    doc.add(Field('uid', uid, self._ft1))
                    self._writer.addDocument(doc)
                except Exception as e:
                    print("\nFailed to index '{}': {}\n".format(csvs_path, e))
                print("\rProcessed {}/{} patents in file".format(j, len(patents)), end='', flush=True)
            print()
        self._commit()
        return self

    def _read_csv(self, path):
        try:
            with open(path, 'r', newline='') as fs:
                reader = csv.reader(x.replace('\0', '') for x in fs)
                rows = [r for r in reader]
                return rows
        except Exception as e:
            print("\nError reading file '{}' : {} \n".format(path, e))
            return None

    def _commit(self):
        ticker = Ticker()
        print("Committing index", end='', flush=True)
        threading.Thread(target=ticker.run).start()
        self._writer.commit()
        self._writer.close()
        ticker.tick = False
        print("Done!")

    def _initialize(self):
        if not os.path.exists(self.path):
            os.mkdir(self.path)
        self._analyzer = LimitTokenCountAnalyzer(self._analyzer, 1048576)
        self._store = SimpleFSDirectory(Paths.get(self.path))
        self._config = IndexWriterConfig(self._analyzer)
        self._config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self._writer = IndexWriter(self._store, self._config)
        self._set_fieldtypes()

    def _set_fieldtypes(self):
        # ft1: stored, untokenized metadata fields
        self._ft1 = FieldType()
        self._ft1.setStored(True)
        self._ft1.setTokenized(False)
        self._ft1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        # ft2: stored, tokenized text fields
        # (term-vector flags assumed to belong to the tokenized _ft2 type)
        self._ft2 = FieldType()
        self._ft2.setStored(True)
        self._ft2.setTokenized(True)
        self._ft2.setStoreTermVectors(True)
        self._ft2.setStoreTermVectorOffsets(True)
        self._ft2.setStoreTermVectorPositions(True)
        self._ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

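A hedged sketch of how this class might be driven; the index path, CSV directory, and analyzer choice are assumptions, and Ticker remains the progress helper the class already references (not defined here).

# hypothetical driver for the IndexFiles class above; directory names are placeholders
import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    IndexFiles('patents.index', StandardAnalyzer()).index('patent_csvs')
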
import lucene

# for customized field
from org.apache.lucene.document import Field
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

CUSTOM_FIELD_TEXT = FieldType()
CUSTOM_FIELD_TEXT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT.setStored(True)
CUSTOM_FIELD_TEXT.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorOffsets(True)
#CUSTOM_FIELD_TEXT.setStoreTermVectorPayloads(True)
CUSTOM_FIELD_TEXT.setTokenized(True)

CUSTOM_FIELD_TEXT_BF = FieldType()
CUSTOM_FIELD_TEXT_BF.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT_BF.setStored(False)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectorOffsets(True)
CUSTOM_FIELD_TEXT_BF.setTokenized(True)

CUSTOM_FIELD_TEXT_DF = FieldType()
CUSTOM_FIELD_TEXT_DF.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
CUSTOM_FIELD_TEXT_DF.setStored(False)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectorPositions(False)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectorOffsets(False)

import lucene

# for customized field
from org.apache.lucene.document import Field
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

CUSTOM_FIELD_TEXT = FieldType()
CUSTOM_FIELD_TEXT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT.setStored(True)
CUSTOM_FIELD_TEXT.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorOffsets(True)
#CUSTOM_FIELD_TEXT.setStoreTermVectorPayloads(True)
CUSTOM_FIELD_TEXT.setTokenized(True)

# Note that the difference between endOffset() and startOffset() may not be equal to
# termText.length(), as the term text may have been altered by a stemmer or some other filter.

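A hedged sketch of how a field type like CUSTOM_FIELD_TEXT above might be exercised, and how the offsets mentioned in the note can be read back from a term vector. The document text, the "body" field name, and the RAMDirectory setup are illustrative assumptions (assuming a Lucene 6+ PyLucene build), not part of the original code.

# hedged sketch (assumed setup): index one document with CUSTOM_FIELD_TEXT from
# above, then read term offsets back from its term vector
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document
from org.apache.lucene.index import DirectoryReader, IndexWriter, IndexWriterConfig, PostingsEnum
from org.apache.lucene.store import RAMDirectory

directory = RAMDirectory()
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
doc = Document()
doc.add(Field("body", "indexing indexed texts", CUSTOM_FIELD_TEXT))   # "body" is a made-up field name
writer.addDocument(doc)
writer.commit()
writer.close()

reader = DirectoryReader.open(directory)
terms = reader.getTermVector(0, "body")
termsEnum = terms.iterator()
term = termsEnum.next()
while term is not None:
    postings = termsEnum.postings(None, PostingsEnum.OFFSETS)
    postings.nextDoc()
    for _ in range(postings.freq()):
        postings.nextPosition()
        # startOffset()/endOffset() come from the stored term vector offsets
        print("%s %d-%d" % (term.utf8ToString(), postings.startOffset(), postings.endOffset()))
    term = termsEnum.next()
reader.close()
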