-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
70 lines (56 loc) · 1.95 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
import json
import os
import requests
import lucene
from bs4 import BeautifulSoup
from java.io import File
from datetime import datetime
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.util import Version
from org.apache.lucene.store import RAMDirectory, FSDirectory
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser, QueryParserBase, MultiFieldQueryParser
# Filesystem path of the Lucene index directory; index() opens it with FSDirectory.
luceneDirectory = "./index/"
def create_doc(data):
    """Build a Lucene ``Document`` from one parsed tweet record.

    ``data`` is subscripted with the keys ``screen_name``, ``tweet``,
    ``tweet_date``, ``tweet_location`` and ``page_title``; a missing key
    raises ``KeyError``.  The first three values are always added as stored
    ``TextField`` entries; the last two are added only when truthy.

    Returns the populated ``Document``.
    """
    # Resolve every key up front, in the same order as before, so a missing
    # key fails before any Lucene object is created.
    always_present = (
        ("username", data['screen_name']),
        ("text", data['tweet']),
        ("date", data['tweet_date']),
    )
    only_if_set = (
        ("location", data['tweet_location']),
        ("page title", data['page_title']),
    )

    record = Document()
    for field_name, value in always_present:
        record.add(TextField(field_name, value, Field.Store.YES))
    for field_name, value in only_if_set:
        # Empty / None optional values are left out of the document entirely.
        if value:
            record.add(TextField(field_name, value, Field.Store.YES))
    return record
def index():
    """Index the parsed tweet files into the on-disk Lucene index.

    Reads ``./parsed/parsed_data2.txt`` through ``./parsed/parsed_data200.txt``
    (inclusive).  Every non-blank line is parsed as one JSON record, turned
    into a Lucene document by ``create_doc`` and added to the index located
    at ``luceneDirectory``.

    On success the writer is closed, which commits the added documents.  If
    anything fails part-way (missing input file, malformed JSON, a Lucene
    error) the writer is rolled back -- discarding the uncommitted additions
    and releasing the writer -- and the original exception is re-raised.
    """
    index_path = File(luceneDirectory).toPath()
    directory = FSDirectory.open(index_path)
    # Wrap the standard analyzer so at most 128479 tokens per field are indexed.
    analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 128479)
    writer = IndexWriter(directory, IndexWriterConfig(analyzer))
    try:
        # Input files are numbered 2..200 inclusive.
        for file_number in range(2, 201):
            file_name = './parsed/parsed_data' + str(file_number) + '.txt'
            # The with-statement closes the file; no explicit close() needed.
            with open(file_name) as f:
                for line in f:
                    # Tolerate blank lines in the JSON-lines input.
                    if not line.strip():
                        continue
                    writer.addDocument(create_doc(json.loads(line)))
    except Exception:
        # Do not leave the writer open or commit a partial run.
        writer.rollback()
        raise
    writer.close()
if __name__ == "__main__":
    # Script entry point: bring up the JVM through PyLucene before index()
    # touches any Lucene class, then build the index.
    lucene.initVM()
    index()