def __init__(self, input_path):  # input_file temporarily removed from the arguments
    self.txt_files = glob.glob(input_path + "en*.txt")
    self.TITLE = "title"
    self.TEXT = "text"
    self.directory = input_path
    # The indexes are kept on disk (SimpleFSDirectory); open a handle to each
    self.in_directory = SimpleFSDirectory.open(Paths.get(os.path.join(self.directory, INDEX_DIR)))
    self.in_directory_English = SimpleFSDirectory.open(Paths.get(os.path.join(self.directory, INDEX_DIR_ENG)))
    self.in_directory_English_lemma = SimpleFSDirectory.open(Paths.get(os.path.join(self.directory, INDEX_DIR_EL)))
    self.queries = []
    # spaCy pipeline used for tagging/lemmatization only; parser and NER are not needed
    self.sp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    self.sp_bodies_lemma = []
    self.sp_bodies_pos = []
    self.query_lemma = ""
    self.query_pos = ""
    self.prec_at_1 = 0
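The query_lemma, query_pos, and sp_bodies_* fields suggest that queries and document bodies are lemmatized and POS-tagged with the spaCy pipeline loaded above. A minimal sketch of that preprocessing step, assuming a helper method on the same class (the method name is hypothetical):

def _lemmatize_and_tag(self, text):
    # Hypothetical helper: run the spaCy pipeline loaded in __init__ and return
    # a whitespace-joined lemma string plus a parallel string of coarse POS tags
    doc = self.sp(text)
    lemmas = " ".join(token.lemma_ for token in doc)
    pos_tags = " ".join(token.pos_ for token in doc)
    return lemmas, pos_tags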
Example #2
def main(args):
    global verbose
    verbose = args.verbose

    if verbose:
        logger.info(f'Read {args.dir_index}')
    directory = SimpleFSDirectory.open(Paths.get(args.dir_index))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    reader = searcher.getIndexReader()

    if verbose:
        logger.info(f'Write to {args.path_output}')
    with open(args.path_output, 'w') as f:
        for idx in trange(reader.maxDoc()):
            doc = reader.document(idx)
            babelnet_id = doc.get('ID')
            synset_id = doc.get('SYNSET_ID')
            pos = doc.get('POS')
            synset_type = doc.get('TYPE')
            main_sense = doc.get('MAIN_SENSE')
            categories = list(doc.getValues('CATEGORY'))
            translation_mappings = list(doc.getValues('TRANSLATION_MAPPING'))
            images = list(doc.getValues('IMAGE'))
            # LEMMA_* fields are parallel arrays stored alongside LEMMA
            lemmas = doc.getValues('LEMMA')
            sources = doc.getValues('LEMMA_SOURCE')
            langs = doc.getValues('LEMMA_LANGUAGE')
            weights = doc.getValues('LEMMA_WEIGHT')
            sense_keys = doc.getValues('LEMMA_SENSEKEY')
            forms = []
            for i in range(len(lemmas)):
                forms.append({
                    'lemma': lemmas[i],
                    'source': sources[i],
                    'lang': langs[i],
                    'weight': weights[i],
                    'sense_key': sense_keys[i],
                })
            entry = {
                'id': babelnet_id,
                'synset': synset_id,
                'pos': pos,
                'type': synset_type,
                'main_sense': main_sense,
                'categories': categories,
                'translation_mappings': translation_mappings,
                'images': images,
                'forms': forms
            }
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    return 0
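main() expects an argparse-style namespace with dir_index, path_output, and verbose attributes, and it uses Lucene classes, so the JVM must be started first. A minimal driver sketch under those assumptions (the flag names are inferred from the attributes used above, not taken from the original script):

if __name__ == '__main__':
    import argparse
    import lucene

    parser = argparse.ArgumentParser(description='Dump a Lucene index to JSON lines')
    parser.add_argument('--dir-index', required=True, help='path to the Lucene index directory')
    parser.add_argument('--path-output', required=True, help='output path for the JSONL dump')
    parser.add_argument('--verbose', action='store_true')
    cli_args = parser.parse_args()

    lucene.initVM()  # must run before any Lucene class is touched
    raise SystemExit(main(cli_args))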
Example #3
def create_document(line):
    # Each input line is whitespace-separated: <keyterm> <serial number> <text ...>
    doc = Document()
    line = line.split()
    keyterm = line[0]
    doc.add(StringField("keyterm", keyterm, Field.Store.YES))
    index = line[1]
    doc.add(StringField("Sno", index, Field.Store.YES))
    del line[0:2]
    line = ' '.join(line)
    # Make sure the de-underscored keyterm itself appears in the indexed text
    qterm = keyterm.replace("_", " ")
    if qterm not in line:
        line = qterm + ' ' + line
    doc.add(TextField("text", line, Field.Store.YES))
    return doc


lucene.initVM()
index_path = File(INDEX_DIR).toPath()
directory = SimpleFSDirectory.open(index_path)
analyzer = StandardAnalyzer()
config = IndexWriterConfig(analyzer)
writer = IndexWriter(directory, config)
print("Number of documents:", writer.numDocs())

for input_file in listdir(INPUT_DIR):
    print("Current file:", input_file)
    if input_file.endswith(".txt"):
        path = INPUT_DIR + input_file
        with open(path) as file:
            # The first readline() is consumed before the loop, so the first line of each file is skipped
            line = file.readline()
            while line:
                line = file.readline()
                if len(line.strip()) != 0:
                    doc = create_document(line)
                    writer.addDocument(doc)

writer.commit()
writer.close()
Example #4
    def run(self, writer=None, analyzer=None):

        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        searcher = IndexSearcher(DirectoryReader.open(
            SimpleFSDirectory.open(File(self.store_dir))))
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return

            print "Searching for:", command
            query = QueryParser(Version.LUCENE_43, "contents",
                analyzer).parse(command)

            # We'll just show the top 10 matching documents for now
            scoreDocs = searcher.search(query, 10).scoreDocs
            print "%s total matching documents." % len(scoreDocs)

            # Highlight the matching text in red
            highlighter = Highlighter(
                SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
                QueryScorer(query))

            # Using NullFragmenter since we still want to see
            # the whole document
            highlighter.setTextFragmenter(NullFragmenter())

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream("contents",
                    StringReader(doc.get("contents")))

                # arg 3: the maximum number of fragments
                # arg 4: the separator used to intersperse the
                # document fragments (typically "...")
                # arg 3 and 4 don't really matter with NullFragmenter
                result = highlighter.getBestFragments(tokenStream,
                    doc.get("contents"), 2, "...")

                if len(result) > 10:
                    with open(self.hits_dir + '/' + doc.get("name"), 'w+') as file_handler:
                        file_handler.write(result)

            # create hit fragments, if we want to show them
            # arg 1: fragment size
            highlighter.setTextFragmenter(SimpleFragmenter(200))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream("contents",
                    StringReader(doc.get("contents")))

                result = highlighter.getBestFragments(tokenStream,
                    doc.get("contents"), 2, "...")

                if len(result) > 10:
                    with open(self.frags_dir + '/' + doc.get("name"), 'w+') as file_handler:
                        file_handler.write(result)
Example #5
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through database folder recursively, i.e. all files have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
        ID, text, Reddit ID, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()
    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    print()
    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts, so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to use a custom fieldtype
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip duplicate post
                    post_ids.add(rid)

                    doc = Document()

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(
                        StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(
                duplicate_counter))

    writer.commit()
    writer.close()

    print()
    print("Finished indexing!")
Example #6
    def run(self, writer=None, analyzer=None):

        if writer is None:
            writer = self.writer

        if analyzer is None:
            analyzer = self.analyzer

        searcher = IndexSearcher(DirectoryReader.open(
            SimpleFSDirectory.open(File(self.store_dir))))
        while True:
            print()
            print("Hit enter with no input to quit.")
            command = input("Query:")
            if command == '':
                return

            print("Searching for:", command)
            query = QueryParser(Version.LUCENE_43, "contents",
                                analyzer).parse(command)

            # We'll just show the top 10 matching documents for now
            scoreDocs = searcher.search(query, 10).scoreDocs
            print("%s total matching documents." % len(scoreDocs))

            # Highlight the matching text in red
            highlighter = Highlighter(
                SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
                QueryScorer(query))

            # Using NullFragmenter since we still want to see
            # the whole document
            highlighter.setTextFragmenter(NullFragmenter())

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream(
                    "contents", StringReader(doc.get("contents")))

                # arg 3: the maximum number of fragments
                # arg 4: the separator used to intersperse the
                # document fragments (typically "...")
                # arg 3 and 4 don't really matter with NullFragmenter
                result = highlighter.getBestFragments(tokenStream,
                                                      doc.get("contents"), 2,
                                                      "...")

                if len(result) > 10:
                    with open(self.hits_dir + '/' + doc.get("name"), 'w+') as file_handler:
                        file_handler.write(result)

            # create hit fragments, if we want to show them
            # arg 1: fragment size
            highlighter.setTextFragmenter(SimpleFragmenter(200))

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                tokenStream = analyzer.tokenStream(
                    "contents", StringReader(doc.get("contents")))

                result = highlighter.getBestFragments(tokenStream,
                                                      doc.get("contents"), 2,
                                                      "...")

                if len(result) > 10:
                    with open(self.frags_dir + '/' + doc.get("name"), 'w+') as file_handler:
                        file_handler.write(result)