Example 1
    def index_docs(self, train_set, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for ii in train_set:
            doc = Document()
            doc.add(Field("answer", ii['Answer'], t1))
            doc.add(Field("qid", ii['Question ID'], t1))
            doc.add(Field("category", ii['category'], t1))
            doc.add(Field("position", ii['Sentence Position'], t1))
            doc.add(Field("question", ii['Question Text'], t2))
            doc.add(Field("wiki_plain",
                          self.wiki_reader.get_text(ii['Answer']), t2))
            writer.addDocument(doc)
Example 2
    def index_docs(self, tweets, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        t1.setStoreTermVectors(True)
        t1.setStoreTermVectorOffsets(True)

        # add each tweet to the index
        for tweet in tweets:
            try:
                # strip out URLs because they provide false index matches
                contents = []
                for word in tweet[1].text.split():
                    if word.startswith("http://") or word.startswith("https://"):
                        continue
                    contents.append(word)
                contents = " ".join(contents)

                if len(contents) == 0: continue

                doc = Document()
                doc.add(Field("contents", contents, t1))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in index_docs:", e
Example 3
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setStoreTermVectors(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPositions(True)
        t2.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        file_path = root + 'r52-train-all-terms.txt'
        fd = open(file_path)
        contents = fd.readlines()
        fd.close()
        contents_list = [x.strip() for x in contents]
        for i in xrange(len(contents_list)):
            try:
                [topic, content] = contents_list[i].split('\t')
                doc = Document()
                doc.add(Field("id", str(i), t1))
                doc.add(Field("topic", topic, t1))
                doc.add(Field("contents", content, t2))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example 4
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.json'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = json.load(file)
                    file.close()
                    doc = Document()
                    doc.add(Field("title", contents['title'], t2))
                    doc.add(Field("path", root, t1))
                    doc.add(Field("playStoreURL", contents['playStoreURL'],
                                  t1))
                    doc.add(Field("creator", contents['creator'], t1))
                    extendedInfo = contents['extendedInfo']
                    if len(extendedInfo['description']) > 0:
                        doc.add(
                            Field("description", extendedInfo['description'],
                                  t2))
                    else:
                        print "warning: no description in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
Example 5
    def tweetIndexer(self, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        x = 0
        for i in range(0,500):
            if not os.path.isfile("json/tweets-" + str(i) + ".json"):
                break

            print "adding tweets-" + str(i) + ".json"
            tweets = open("json/tweets-" + str(i) + ".json", "r")

            for line in tweets.readlines():
                tweet = json.loads(line)
                if 'limit' in tweet:
                    continue
                try:
                    doc = Document()
                    doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1))
                    sname = tweet['user']['screen_name']
                    tid = str(tweet['id'])
                    text = tweet['text']
                    uname = tweet['user']['name']
                    created = tweet['created_at']
                    tstamp = tweet['timestamp_ms']
                    place = ""
                    if tweet['place']:
                        place = tweet['place']['full_name'] + ", " + tweet['place']['country']
                    lat = ""
                    lng = ""
                    titles = ""
                    urls = ""
                    exist = "false"

                    if tweet['coordinates']:
                        lat = str(tweet['coordinates']['coordinates'][1])
                        lng = str(tweet['coordinates']['coordinates'][0])
                    else:
                        lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2)
                        lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2)
                    
                    if len(tweet['entities']['urls']) != 0:
                        exist = "true"
                        for index in range(len(tweet['entities']['urls'])):
                            title = tweet['entities']['urls'][index]['url_title']
                            if title == None:
                                titles += ",-"
                            else:
                                title = title.encode('ascii','ignore')
                                titles += "," + str(title)
                            urls += " " + str(tweet['entities']['urls'][index]['expanded_url'])


                    searchable = text + " " + urls + " " + uname + " " + sname + " " + place
                    doc.add(Field("lookup", searchable, t2))
                    doc.add(Field("text", text, t2))
                    doc.add(Field("user_name", uname, t2)) 
                    doc.add(Field("screen_name", sname, t2))                    
                    doc.add(Field("tweet_id", tid, t2))
                    doc.add(Field("created_at", created, t2))
                    doc.add(Field("geo_lat", lat, t2))
                    doc.add(Field("geo_lng", lng, t2))
                    doc.add(Field("url_exist", exist, t2))
                    doc.add(Field("url_url", urls, t2))
                    doc.add(Field("url_title", titles, t2))
                    doc.add(Field("timestamp", tstamp, t2))
                    writer.addDocument(doc)
                    x += 1
                except Exception, e:
                    pass
            tweets.close()
Example 6
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

directory = RAMDirectory()
iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
iwriter = IndexWriter(directory, iconfig)

ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = [
    "this bernhard is the text to be index text",
    "this claudia is the text to be indexed"
]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)
Example 7
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

directory = RAMDirectory()
iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
iwriter = IndexWriter(directory, iconfig)

ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)

for doc in xrange(0, len(ts)):
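Both Example 6 and Example 7 stop right after opening the DirectoryReader, and Example 7 additionally breaks off at the loop header above. A minimal sketch of how the term vectors stored with these field types are typically read back through the getTermVector/TermsEnum API exposed by PyLucene (this continuation is an assumption, not the original source) would be:

for doc in xrange(0, len(ts)):
    # read back the term vector stored for this document's "fieldname" field
    termsEnum = ireader.getTermVector(doc, "fieldname").iterator()
    for term in BytesRefIterator.cast_(termsEnum):
        # print each term together with its frequency inside this document
        print("%s: %d" % (term.utf8ToString(), termsEnum.totalTermFreq()))
ireader.close()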
Example 8
def BuildSearchEngine(start, maxPages,domain,first):  
	# only initiate the VM if this is the first call
	if first == True:
		lucene.initVM(vmargs=['-Djava.awt.headless=true'])
	print ('lucene'), lucene.VERSION
	if not os.path.exists("IndexFiles.index"):
		os.mkdir("IndexFiles.index")
	store = SimpleFSDirectory(Paths.get("IndexFiles.index"))
	config = IndexWriterConfig(StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET))
	#if first time being called, create new index, otherwise only append new pages into old index
	if first == True:
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
	else:
		config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
	writer = IndexWriter(store, config)
	
	#configure settings for pages being saved
	t1 = FieldType()
	t1.setStored(True)
	t1.setTokenized(False)
	t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
	t2 = FieldType()
	t2.setStored(False)
	t2.setTokenized(True)
	t2.setStoreTermVectors(True)
	t2.setStoreTermVectorOffsets(True)
	t2.setStoreTermVectorPositions(True)
	t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
	
	pagesToVisit = [start]
	hashtable=dict()
	hashtable[start]=1
	numberVisited = 0
	rp = robotparser.RobotFileParser()
	robotFileLocation = "http://www."+domain+"/robots.txt"
	rp.set_url(robotFileLocation)
	rp.read()
	# The main loop. Create a LinkParser and get all the links on the page.
	while numberVisited < maxPages and pagesToVisit != []:
		numberVisited = numberVisited + 1
		# Start from the beginning of our collection of pages to visit:
		url = pagesToVisit[0]
		pagesToVisit = pagesToVisit[1:]
		try:
			print(numberVisited, "Visiting:", url)
			parser = LinkParser()
			data, links,hashtable = parser.getLinks(url,domain,hashtable,rp)
			# Add the pages that we visited to the end of our collection
			# of pages to visit:
			print(" **Success!**")
			path = "files/a.html"
			urllib.urlretrieve(url,path)
			file = open("files/a.html")
			contents = removeTag(file);
			file.close()
			file = open("files/a.html","w")
			file.write(contents)
			file.close()
			file = open("files/a.html")
			contents = file.read()
			file.close()
			doc = Document()
			doc.add(Field("name", "a.html", t1))
			doc.add(Field("path", "files", t1))
			#index the url
			doc.add(Field("url", url, t1))
			if len(contents) > 0:
				doc.add(Field("contents", contents.decode("utf-8").replace(u"\u2019","'").replace(u"\u2018","'").replace(u"\ufe0f","'").replace(u"\u20e3","'"), t2))
			else:
				print ("warning: no content in %s") % url
			writer.addDocument(doc)
			pagesToVisit = pagesToVisit + links
		except Exception,e:
			print Exception,":",e
			print(" **Failed!**")
Example 9
class IndexFiles:
    def __init__(self, path, analyzer):
        self.path = path
        self._analyzer = analyzer
        self.errors = []
        self._initialize()

    def index(self, csvs_path):

        all_csvs = [x for x in os.listdir(csvs_path) if x.endswith('csv')]

        for i, csv_file in enumerate(all_csvs, 1):
            print("\nProcessing CSV #{}".format(i), flush=True)

            patents = self._read_csv(csvs_path + "/" + csv_file)
            if patents is None:
                continue
            print("\rProcessed {}/{} patents in file".format(0, len(patents)),
                  end='',
                  flush=True)
            for j, patent in enumerate(patents, 1):

                pid, date, title, author, icn, org, acn, abstract, description, purpose, mechanics, uid = patent

                try:
                    doc = Document()
                    doc.add(Field('id', pid, self._ft1))
                    doc.add(Field('date', date, self._ft1))
                    doc.add(Field('title', title, self._ft2))
                    doc.add(Field('author', author, self._ft1))
                    doc.add(Field('icn', icn, self._ft1))
                    doc.add(Field('organization', org, self._ft1))
                    doc.add(Field('acn', acn, self._ft1))
                    doc.add(Field('abstract', abstract, self._ft2))
                    doc.add(Field('description', description, self._ft2))
                    doc.add(Field('purpose', purpose, self._ft2))
                    doc.add(Field('mechanics', mechanics, self._ft2))
                    doc.add(Field('uid', uid, self._ft1))

                    self._writer.addDocument(doc)

                except Exception as e:
                    print("\nFailed to index '{}': {}\n".format(csvs_path, e))
                print("\rProcessed {}/{} patents in file".format(
                    j, len(patents)),
                      end='',
                      flush=True)
            print()
        self._commit()
        return self

    def _read_csv(self, path):

        try:
            with open(path, 'rU', newline='') as fs:
                reader = csv.reader(x.replace('\0', '') for x in fs)
                rows = [r for r in reader]

            return rows
        except Exception as e:
            print("\nError reading file '{}' : {} \n".format(path, e))
            return None

    def _commit(self):

        ticker = Ticker()
        print("Committing index", end='', flush=True)
        threading.Thread(target=ticker.run).start()
        self._writer.commit()
        self._writer.close()
        ticker.tick = False
        print("Done!")

    def _initialize(self):

        if not os.path.exists(self.path):
            os.mkdir(self.path)

        self._analyzer = LimitTokenCountAnalyzer(self._analyzer, 1048576)
        self._store = SimpleFSDirectory(Paths.get(self.path))
        self._config = IndexWriterConfig(self._analyzer)
        self._config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self._writer = IndexWriter(self._store, self._config)
        self._set_fieldtypes()

    def _set_fieldtypes(self):

        self._ft1 = FieldType()
        self._ft1.setStored(True)
        self._ft1.setTokenized(False)
        self._ft1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self._ft2 = FieldType()
        self._ft2.setStored(True)
        self._ft2.setTokenized(True)
        self._ft2.setStoreTermVectors(True)
        self._ft2.setStoreTermVectorOffsets(True)
        self._ft2.setStoreTermVectorPositions(True)
        self._ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
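The IndexFiles class above only covers the indexing side. A hypothetical usage sketch (the index directory "patents.index" and CSV folder "patent_csvs" are assumptions, and lucene.initVM() is expected to have been called already):

from org.apache.lucene.analysis.standard import StandardAnalyzer

# hypothetical paths; _initialize() creates the index directory if needed
indexer = IndexFiles("patents.index", StandardAnalyzer())
indexer.index("patent_csvs")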
Example 10
import lucene
# for customized field
from org.apache.lucene.document import Field
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

CUSTOM_FIELD_TEXT = FieldType()
CUSTOM_FIELD_TEXT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT.setStored(True)
CUSTOM_FIELD_TEXT.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorOffsets(True)
#CUSTOM_FIELD_TEXT.setStoreTermVectorPayloads(True)
CUSTOM_FIELD_TEXT.setTokenized(True)

CUSTOM_FIELD_TEXT_BF = FieldType()
CUSTOM_FIELD_TEXT_BF.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT_BF.setStored(False)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectorOffsets(True)
CUSTOM_FIELD_TEXT_BF.setTokenized(True)

CUSTOM_FIELD_TEXT_DF = FieldType()
CUSTOM_FIELD_TEXT_DF.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
CUSTOM_FIELD_TEXT_DF.setStored(False)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectorPositions(False)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectorOffsets(False)
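These module-level field types are meant to be passed to Field instances at indexing time. A hypothetical usage sketch (the document layout and field names are assumptions, not part of the original source):

from org.apache.lucene.document import Document

doc = Document()
# stored text with full term vectors (positions and offsets)
doc.add(Field("title", "a stored, searchable title", CUSTOM_FIELD_TEXT))
# unstored body text, still with full term vectors
doc.add(Field("body", "a longer body that is indexed but not stored", CUSTOM_FIELD_TEXT_BF))
# unstored text whose postings keep only docs and frequencies
doc.add(Field("keywords", "terms indexed without positions", CUSTOM_FIELD_TEXT_DF))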
Example 11
import lucene
# for customized field
from org.apache.lucene.document import Field
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

CUSTOM_FIELD_TEXT = FieldType()
CUSTOM_FIELD_TEXT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT.setStored(True)
CUSTOM_FIELD_TEXT.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorOffsets(True)
#CUSTOM_FIELD_TEXT.setStoreTermVectorPayloads(True)
CUSTOM_FIELD_TEXT.setTokenized(True)

# Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter.
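To illustrate that note: when a field is indexed with CUSTOM_FIELD_TEXT above, the stored term-vector offsets can be read back per term, and they always refer to character spans in the original field value even if the analyzer's stemmer shortened the indexed term. A hedged sketch, where the open reader, document 0, and the field name "text" are assumptions:

from org.apache.lucene.index import PostingsEnum
from org.apache.lucene.util import BytesRefIterator

# 'reader' is assumed to be an open DirectoryReader over an index built
# with CUSTOM_FIELD_TEXT; document 0 and field "text" are placeholders
termsEnum = reader.getTermVector(0, "text").iterator()
for term in BytesRefIterator.cast_(termsEnum):
    postings = termsEnum.postings(None, PostingsEnum.OFFSETS)
    postings.nextDoc()
    for _ in range(postings.freq()):
        postings.nextPosition()
        # offsets index into the original text, not the (possibly stemmed) term
        print("%s [%d:%d]" % (term.utf8ToString(),
                              postings.startOffset(), postings.endOffset()))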