def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
             queryor=False, phrase=False):
    """
    :param minsize: The minimum length of the N-grams.
    :param maxsize: The maximum length of the N-grams.
    :param stored: Whether to store the value of this field with the
        document. Since this field type generally contains a lot of text,
        you should avoid storing it with the document unless you need to,
        for example to allow fast excerpts in the search results.
    :param queryor: if True, combine the N-grams with an Or query. The
        default is to combine N-grams with an And query.
    :param phrase: store positions on the N-grams to allow exact phrase
        searching. The default is off.
    """

    formatclass = formats.Frequency
    if phrase:
        formatclass = formats.Positions

    self.analyzer = NgramAnalyzer(minsize, maxsize)
    self.format = formatclass(field_boost=field_boost)
    self.stored = stored
    self.queryor = queryor
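For context, here is a minimal sketch (not taken from the source above) of how an NGRAM field backed by NgramAnalyzer can be used end to end; the schema, document contents, and query text are illustrative assumptions.

# Illustrative sketch: index a word as character N-grams and match a fragment.
from whoosh.fields import Schema, ID, NGRAM
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = Schema(name=ID(stored=True),
                content=NGRAM(minsize=2, maxsize=4))
ix = RamStorage().create_index(schema)

writer = ix.writer()
writer.add_document(name=u"doc1", content=u"strawberry")
writer.commit()

with ix.searcher() as searcher:
    # "rawb" matches because its 2-4 character grams all occur in "strawberry"
    query = QueryParser("content", ix.schema).parse(u"rawb")
    for hit in searcher.search(query):
        print(hit["name"])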
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()

    # Prepare analyzers
    # - standard analyzer simply splits words
    # - stemming extracts stems, to catch things like plurals
    analyzers = [
        (SimpleAnalyzer(), True),
        (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
        (StandardAnalyzer(), False),
        (StemmingAnalyzer(), False),
    ]
    source_language = unit.translation.subproject.project.source_language
    lang_code = source_language.base_code()
    # Add per language analyzer if Whoosh has it
    if has_stemmer(lang_code):
        analyzers.append((LanguageAnalyzer(lang_code), False))
    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append((NgramAnalyzer(4), False))

    # Extract words from all plurals and from context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer, combine in analyzers:
            # Some Whoosh analyzers break on unicode
            new_words = []
            try:
                new_words = [token.text for token in analyzer(text)]
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error, sys.exc_info())
            words.update(new_words)
            # Add combined strings to allow matching against multi-word
            # entries, combining up to 5 words
            if combine:
                words.update([
                    ' '.join(new_words[x:y])
                    for x in range(len(new_words))
                    for y in range(1, min(x + 6, len(new_words) + 1))
                    if x != y
                ])

    # Grab all words in the dictionary
    dictionary = self.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language
    )

    if '' in words:
        words.remove('')

    if len(words) == 0:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query for fetching the words
        # Can not use __in as we want case insensitive lookup
        dictionary = dictionary.filter(source__iregex=r'^({0})$'.format(
            '|'.join([re_escape(word) for word in words])
        ))

    return dictionary
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()

    # Prepare analyzers
    # - standard analyzer simply splits words
    # - stemming extracts stems, to catch things like plurals
    analyzers = [
        StandardAnalyzer(),
        StemmingAnalyzer(),
    ]
    source_language = unit.translation.subproject.project.source_language
    lang_code = source_language.base_code()
    # Add per language analyzer if Whoosh has it
    if has_stemmer(lang_code):
        analyzers.append(LanguageAnalyzer(lang_code))
    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(
                    [token.text for token in analyzer(force_text(text))]
                )
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error, sys.exc_info())

    # Grab all words in the dictionary
    dictionary = self.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language
    )

    if len(words) == 0:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query for fetching the words
        # Can not use __in as we want case insensitive lookup
        query = Q()
        for word in words:
            query |= Q(source__iexact=word)

        # Filter dictionary
        dictionary = dictionary.filter(query)

    return dictionary
def load_states():
    analyzer = NgramAnalyzer(1, 2)
    state_schema = Schema(state=ID(stored=True, analyzer=analyzer))

    with cursor() as cur:
        print('Loading states...')
        cur.execute('SELECT DISTINCT state FROM msa')
        state_index = storage.create_index(state_schema)
        writer = state_index.writer()
        for s in cur.fetchall():
            writer.add_document(state=s[u'state'])
        writer.commit()

    return state_index
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if '' in words:
        words.remove('')

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    return self.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
        source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
            '|'.join(re_escape(word) for word in islice(words, 1000))
        ),
    )
def exec_comp():
    '''
    Calculate MRR (Mean Reciprocal Rank) and save a table with the MRR
    evaluation for every search engine configuration.
    '''
    # text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]

    # analyzer names, used for the graph and for the MRR table
    sel_ana = [
        'StemmingAnalyzer()',
        'SimpleAnalyzer()',
        'StandardAnalyzer()',
        'RegexAnalyzer()',
        'FancyAnalyzer()',
        'NgramAnalyzer(5)',
        'KeywordAnalyzer()',
        'LanguageAnalyzer()'
    ]

    i = 0  # counter
    mrrs = []  # list where MRR values for each SE configuration will be stored

    # scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    # ground truth
    gt1 = pd.read_csv(
        os.getcwd() + "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
        sep='\t'
    )

    # combine every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            # execute queries for the chosen configuration combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save results of the search engine
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv", index=False)
            # calculate MRR
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))

    # store MRR table
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv", index=False)
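The mrr() helper called above is not shown in this excerpt; a minimal sketch of a Mean Reciprocal Rank computation, assuming plain dicts of ranked results and relevant documents rather than the DataFrames used here, could look like this.

# Hypothetical sketch of Mean Reciprocal Rank; the real mrr(gt1, sr_1) above
# works on pandas DataFrames and may differ in column names and details.
def mean_reciprocal_rank(ranked_results, relevant):
    """ranked_results: {query_id: [doc_id, ...] in rank order}
    relevant: {query_id: set of relevant doc_ids}"""
    total = 0.0
    for query_id, docs in ranked_results.items():
        for rank, doc_id in enumerate(docs, start=1):
            if doc_id in relevant.get(query_id, set()):
                total += 1.0 / rank
                break  # only the first relevant hit counts per query
    return total / len(ranked_results) if ranked_results else 0.0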
def __init__(self, minsize=2, maxsize=4, stored=False):
    """
    :param stored: Whether to store the value of this field with the
        document. Since this field type generally contains a lot of text,
        you should avoid storing it with the document unless you need to,
        for example to allow fast excerpts in the search results.
    :param minsize: The minimum length of the N-grams.
    :param maxsize: The maximum length of the N-grams.
    """

    self.format = Frequency(analyzer=NgramAnalyzer(minsize, maxsize))
    self.scorable = True
    self.stored = stored
def load_cities():
    analyzer = NgramAnalyzer(1)
    city_schema = Schema(state=ID(stored=True),
                         city=ID(stored=True, analyzer=analyzer))

    with cursor() as cur:
        print('Loading cities...')
        cur.execute('SELECT DISTINCT state, city FROM msa')
        city_index = storage.create_index(city_schema)
        writer = city_index.writer()
        for s in cur.fetchall():
            writer.add_document(state=s[u'state'], city=s[u'city'])
        writer.commit()

    return city_index
def load_occs():
    analyzer = NgramAnalyzer(3)
    occ_schema = Schema(occ_title=TEXT(stored=True, analyzer=analyzer),
                        occ_code=ID(stored=True))

    with cursor() as cur:
        print('Loading occs...')
        cur.execute('SELECT DISTINCT occ_code, occ_title FROM msa')
        occ_index = storage.create_index(occ_schema)
        writer = occ_index.writer()
        for s in cur.fetchall():
            writer.add_document(occ_title=s[u'occ_title'], occ_code=s[u'occ_code'])
        writer.commit()

    return occ_index
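A hedged sketch of how one of the indexes built above might then be searched; the function name, query handling, and returned fields are illustrative assumptions, not part of the original loaders.

# Illustrative only: search the occupation index built by load_occs() above.
from whoosh.qparser import QueryParser

def search_occs(occ_index, text):
    with occ_index.searcher() as searcher:
        # The NgramAnalyzer(3) on occ_title lets short fragments match titles
        query = QueryParser("occ_title", occ_index.schema).parse(text)
        return [(hit["occ_code"], hit["occ_title"])
                for hit in searcher.search(query)]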
def get_terms(self, unit):
    """Return list of term pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - basic simple analyzer to split on non-word chars
    # - simple analyzer that splits words based on regexp to catch in-word dashes
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer() | stopfilter,
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError):
                report_error(cause="Term words parsing")
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if "" in words:
        words.remove("")

    if not words:
        # No extracted words, no glossary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    words = islice(words, 1000)
    if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
        # Use regex as that is utilizing pg_trgm index
        results = self.filter(
            source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
                "|".join(re_escape(word) for word in words)
            ),
        )
    else:
        # MySQL
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )

    return results.for_project(unit.translation.component.project).filter(
        language=unit.translation.language
    )
            #
            num_added_records_so_far += 1
            if (num_added_records_so_far % 100 == 0):
                print(" num_added_records_so_far= " + str(num_added_records_so_far))
        #
        writer.commit()    # it is necessary to store the index once filled
        in_file.close()    # it is necessary to close the .csv file


'''
Here the "schemas" function is used to create and fill all the schemas (indexes)
for both .csv files (Cranfield.csv and Time.csv).
'''
# all the analyzers that are used
analyzers = [StemmingAnalyzer(), StandardAnalyzer(), RegexAnalyzer(), SimpleAnalyzer(),
             FancyAnalyzer(), NgramAnalyzer(4), KeywordAnalyzer(), LanguageAnalyzer('en')]

# analyzer names
analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer', 'SimpleAnalyzer',
                  'FancyAnalyzer', 'NgramAnalyzer', 'KeywordAnalyzer', 'LanguageAnalyzer']

csv_names = ['Cranfield', 'Time']  # file names

# iterate over all the .csv files (only two: Cranfield.csv and Time.csv)
for name in csv_names:
    print(name, '\n\n')
    path = "C:./" + name + "_DATASET"  # get the path where the .csv is stored
    # iterate to create the 8 different inverted indexes, one per analyzer
    for e, type_analyzer in enumerate(analyzers):
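        # Hypothetical continuation (not part of the original fragment): create
        # one index directory per analyzer. The field names 'id' and 'content',
        # the directory naming, and the use of whoosh.fields.Schema/ID/TEXT,
        # whoosh.index.create_in and os (assumed imported earlier) are
        # illustrative assumptions only.
        schema = Schema(id=ID(stored=True),
                        content=TEXT(stored=False, analyzer=type_analyzer))
        index_dir = os.path.join(path, "index_" + analyzer_names[e])
        os.makedirs(index_dir, exist_ok=True)
        ix = create_in(index_dir, schema)
        # ...then fill ix from the corresponding .csv, as in the code above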
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if "" in words:
        words.remove("")

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    words = islice(words, 1000)
    if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
        results = self.filter(
            source__search=reduce(lambda x, y: x | y,
                                  (SearchQuery(word) for word in words)),
        )
    else:
        # MySQL
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )

    return results.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
    )
from whoosh.fields import Schema, TEXT, STORED, NGRAMWORDS
from whoosh.index import create_in, open_dir
from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, NgramFilter, NgramAnalyzer, NgramWordAnalyzer
#from whoosh.query import *
from whoosh.qparser import QueryParser, MultifieldParser, FieldsPlugin

analyzer = NgramAnalyzer(3)

schema = Schema(
    id=STORED,
    category=TEXT(field_boost=3.0),
    #title = TEXT(analyzer, False)
    title=NGRAMWORDS(2, 20, False, 2.0))

index = create_in("search", schema)
#index = open_dir("search")

writer = index.writer()
writer.add_document(id=0, title="Test Words")
writer.add_document(id=1, title="Apple Banana Cucumber")
writer.add_document(id=2, title="Deck Elevator Floor", category="test")
writer.add_document(id=3, title="Pen Pineapple Apple Pen")
writer.commit()

#parser = QueryParser("title", schema)
parser = MultifieldParser(["category", "title"], schema, {
    "category": 3.0,
    "title": 2.0
})
parser.remove_plugin_class(FieldsPlugin)

with index.searcher() as searcher:
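    # The original snippet breaks off at this point; what follows is a
    # minimal, hypothetical completion (the query text "apple" is chosen
    # purely for illustration).
    query = parser.parse("apple")
    results = searcher.search(query)
    for hit in results:
        print(hit)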