Example #1
0
def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()

    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))

    print "the final accuracy is:", final_accuracy
Example #2
0
def main(index_dir, input_dir):
    """Creates a Lucene Index, and indexes every .json file it finds.
    It utilizes a stopwords.txt to filter out stop words.

    index_dir -- directory where the Lucene index is written.
    input_dir -- directory scanned for *.json input files.

    After indexing, the whole index is dumped to all.csv.
    """
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    # One stop word per line; 'with' guarantees the file is closed.
    with open('stopwords.txt', 'r') as f:
        stopwords = set(line.strip() for line in f)
    logger.debug('Stop words: %s' % str(stopwords))

    # Lucene analyzers expect a CharArraySet rather than a Python set.
    temp = CharArraySet(1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create index
    logger.info("Creating Lucene index [%s]..." % index_dir)

    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    analyzer = StandardAnalyzer(stopwords)
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(fs_dir, writerConfig)

    logger.info("Currently there are %d documents in the index..." %
                writer.numDocs())

    # Index documents: each .json file holds a list of entries with
    # 'url', 'date' and 'title' keys.
    onlyfiles = [
        f for f in listdir(input_dir)
        if isfile(join(input_dir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(input_dir, f)
            # 'with' closes the file even if json.load raises.
            with open(f) as json_data:
                data = json.load(json_data)
            for entry in data:
                doc = Document()
                # StringField: stored verbatim (not tokenized);
                # TextField: analyzed for full-text search.
                doc.add(StringField("journal", journal_code, Field.Store.YES))
                doc.add(StringField("url", entry['url'], Field.Store.YES))
                doc.add(StringField("date", entry['date'], Field.Store.YES))
                doc.add(TextField("title", entry['title'], Field.Store.YES))
                writer.addDocument(doc)
        except IOError as v:
            try:
                (code, message) = v
            except (TypeError, ValueError):
                # Exception was not an unpackable 2-tuple.
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed lines from stdin (%d documents in index)" %
                writer.numDocs())

    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    # Dump every stored document to all.csv for inspection; close the
    # reader even if the CSV write fails.
    reader = DirectoryReader.open(fs_dir)
    try:
        with open('all.csv', 'w') as csvfile:
            csvwriter = csv.writer(csvfile,
                                   delimiter=',',
                                   quotechar='"',
                                   quoting=csv.QUOTE_ALL)
            for i in range(0, reader.numDocs()):
                doc = reader.document(i)
                csvwriter.writerow([
                    doc.get('journal'),
                    doc.get('date'),
                    doc.get('url'),
                    # '\\,' spells the same backslash-comma string as the
                    # original '\,' without the invalid escape sequence.
                    doc.get('title').strip().replace(',', '\\,')
                ])
    finally:
        reader.close()
Example #3
0
def main(indexDir, inputDir):
	"""Creates a Lucene Index, and indexes every .json file it finds.
	It utilizes a stopwords.txt to filter out stop words.

	indexDir -- directory where the Lucene index is written.
	inputDir -- directory scanned for *.json input files.

	After indexing, the whole index is dumped to all.csv.
	"""
	lucene.initVM()

	logger.info("Loading stop words from stopwords.txt")
	# One stop word per line; 'with' guarantees the file is closed.
	with open('stopwords.txt', 'r') as f:
		stopwords = set(line.strip() for line in f)
	logger.debug('Stop words: %s' % str(stopwords))

	# Lucene analyzers expect a CharArraySet rather than a Python set.
	temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)
	for stopword in stopwords:
		temp.add(stopword)
	stopwords = temp

	# Create index
	logger.info("Creating Lucene index [%s]..." % indexDir)

	dir = SimpleFSDirectory(File(indexDir))
	analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
	writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
	writer = IndexWriter(dir, writerConfig)

	logger.info("Currently there are %d documents in the index..." % writer.numDocs())

	# Index documents: each .json file holds a list of entries with
	# 'url', 'date' and 'title' keys.
	onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ]
	for f in onlyfiles:
		try:
			journal_code = f.split('.')[0]
			f = join(inputDir, f)
			# 'with' closes the file even if json.load raises.
			with open(f) as json_data:
				data = json.load(json_data)
			for entry in data:
				doc = Document()
				doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED))
				doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
		except (IOError) as v:
			try:
				(code, message) = v
			except (TypeError, ValueError):
				# Narrowed from a bare except: only the unpacking above
				# can fail here, and a bare except would also swallow
				# KeyboardInterrupt/SystemExit.
				code = 0
				message = v
			logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
	logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs())

	logger.info("Closing index of %d documents..." % writer.numDocs())
	writer.close()

	# Dump every stored document to all.csv (Python 2: the csv module
	# wants a binary-mode file); close the reader even on failure.
	reader = IndexReader.open(dir)
	try:
		with open('all.csv', 'wb') as csvfile:
			csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
			for i in xrange(0, reader.numDocs()):
				doc = reader.document(i)
				# '\\,' spells the same backslash-comma string as the
				# original '\,' without the invalid escape sequence.
				csvwriter.writerow([doc.get('journal'), doc.get('date'), doc.get('url').encode('utf8'),
					doc.get('title').strip().replace(',', '\\,').encode('utf8')])
	finally:
		reader.close()
    # Create a field per parsed value and add those documents to the index.
    addDoc("name", r['name'], writer)
    addDoc("research", r['research'], writer)
    writer.commit()
    writer.close()

# Now search the index for the query terms.
searcher = IndexSearcher(DirectoryReader.open(store))
# FuzzyQuery tolerates small edit distances, so near-spellings of
# "programaçao" in the "research" field also match.
query = FuzzyQuery(Term("research", "programaçao"))

MAX = 1000  # upper bound on the number of hits returned
hits = searcher.search(query, MAX)

# Collect the stored "name" field of every matching document.
# NOTE(review): professorList is defined earlier in the file, outside
# this excerpt.
for hit in hits.scoreDocs:
    doc = searcher.doc(hit.doc)
    professorList.append(doc.get("name"))

# Flask application serving the search front-end below.
app = Flask(__name__)


@app.route('/')
def index():
    """Serve the landing page."""
    home_page = 'home.html'
    return render_template(home_page)


@app.route('/contents', methods=['POST'])
def contents():
    term = request.form['content']

    return render_template('contents.html',
                           term=term,