def __init__(self, stored=False, lowercase=False, commas=False, vector=None,
             scorable=False, unique=False, field_boost=1.0, spelling=False):
    """
    :param stored: Whether to store the value of the field with the document.
    :param commas: Whether this is a comma-separated field. If this is False
        (the default), it is treated as a space-separated field.
    :param scorable: Whether this field is scorable.
    """

    self.analyzer = KeywordAnalyzer(lowercase=lowercase, commas=commas)
    self.format = formats.Frequency(field_boost=field_boost)
    self.scorable = scorable
    self.stored = stored
    self.unique = unique
    self.spelling = spelling

    if vector:
        if type(vector) is type:
            vector = vector()
        elif isinstance(vector, formats.Format):
            pass
        else:
            vector = self.format
    else:
        vector = None
    self.vector = vector
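For context (not part of the original snippet): this constructor appears to match the signature of Whoosh's KEYWORD field type, which wires KeywordAnalyzer into a schema field. A minimal usage sketch, assuming the standard whoosh.fields API:

from whoosh.fields import Schema, ID, KEYWORD

# KEYWORD uses KeywordAnalyzer under the hood; commas=True switches the
# field from space-separated to comma-separated terms.
schema = Schema(doc_id=ID(stored=True, unique=True),
                tags=KEYWORD(stored=True, lowercase=True, commas=True, scorable=True))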
def exec_comp():
    '''
    Calculates MRR (Mean Reciprocal Rank) and saves a table with the MRR
    evaluation for every search engine configuration.
    '''
    # text analyzers
    selected_analyzers = [StemmingAnalyzer(), SimpleAnalyzer(),
                          StandardAnalyzer(), RegexAnalyzer(),
                          FancyAnalyzer(), NgramAnalyzer(5),
                          KeywordAnalyzer(), LanguageAnalyzer('en')]

    # analyzer names, used for the graph and for the MRR table
    sel_ana = ['StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
               'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
               'KeywordAnalyzer()', 'LanguageAnalyzer()']

    i = 0      # counter
    mrrs = []  # MRR values for each search engine configuration

    # scoring functions
    scoring_functions = [scoring.TF_IDF(), scoring.Frequency(),
                         scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    # ground truth
    gt1 = pd.read_csv(os.getcwd() + "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv", sep='\t')

    # combine every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            # execute the queries for the chosen configuration combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save the results of the search engine
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv", index=False)
            # calculate MRR
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))

    # store the MRR table
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv", index=False)
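The mrr helper used above is not included in this excerpt. Below is a minimal sketch of a Mean Reciprocal Rank computation, assuming (these are assumptions, not taken from the snippet) that both the ground-truth and the result DataFrames expose Query_ID and Doc_ID columns and that each query's results are already ordered by rank.

def mrr(ground_truth, results):
    # Sketch only: MRR = (1/|Q|) * sum over queries of 1 / rank of the
    # first relevant document. Column names Query_ID / Doc_ID are assumed.
    reciprocal_ranks = []
    for qid, relevant in ground_truth.groupby('Query_ID')['Doc_ID']:
        relevant = set(relevant)
        ranked = results[results['Query_ID'] == qid]['Doc_ID'].tolist()
        rr = 0.0
        for rank, doc in enumerate(ranked, start=1):
            if doc in relevant:
                rr = 1.0 / rank
                break
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0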
def __init__(self, stored=False, lowercase=False, commas=False,
             scorable=False, unique=False, field_boost=1.0):
    """
    :param stored: Whether to store the value of the field with the document.
    :param commas: Whether this is a comma-separated field. If this is False
        (the default), it is treated as a space-separated field.
    :param scorable: Whether this field is scorable.
    """
    ana = KeywordAnalyzer(lowercase=lowercase, commas=commas)
    self.format = Frequency(analyzer=ana, field_boost=field_boost)
    self.scorable = scorable
    self.stored = stored
    self.unique = unique
def create():
    """
    Create a new Whoosh index.
    """
    print('creating new index in directory %s' % DIRECTORY)
    os.system('rm -rf %s' % DIRECTORY)
    os.mkdir(DIRECTORY)
    schema = Schema(source=ID(stored=True, unique=True),
                    cached=ID(stored=True, unique=True),
                    hash=ID(stored=True, unique=True),
                    title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    year=TEXT(stored=True),
                    notes=TEXT(stored=True),
                    text=TEXT(stored=True),
                    tags=TEXT(stored=True, analyzer=KeywordAnalyzer()),
                    added=DATETIME(stored=True),
                    mtime=DATETIME(stored=True))
    create_in(DIRECTORY, schema, NAME)
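As a usage sketch (not part of the original), documents could be added to the index created above with Whoosh's standard writer API. The helper name index_document and the field values are hypothetical; DIRECTORY and NAME are the globals assumed by create().

from datetime import datetime
from whoosh.index import open_dir

def index_document(source, title, tags):
    # Open the index created by create() and add a single document.
    ix = open_dir(DIRECTORY, indexname=NAME)
    writer = ix.writer()
    writer.add_document(source=source,
                        title=title,
                        tags=tags,  # space-separated keywords for KeywordAnalyzer
                        added=datetime.now())
    writer.commit()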
def queryRec(index, text):
    """
    >>> queryRec(MyIndex, 'ciao')
    defaultdict(<class 'list'>,
                {'cia': ['059r/413_780_36_104.png', ((1, 2), (1, 3), (1, 4))],
                 'o': ('051r/751_1468_23_33.png', (-1, -1))})  # clearly tail
    ['cia', 'o']

    :param index: whoosh index
    :param text: str. Input text
    :return: defaultdict, list.
    """
    char2Images = defaultdict(list)  # e.g. 'a': 'path/image.png'
    orderedComps = []  # 'h', 'e', 'll', 'o'

    with open(connCompsRichJSON, 'r') as ccfile:
        inputTextCC = load(ccfile)

    getSubTokens = (lambda imgNcoord: [inputTextCC[imgNcoord[0]][cmp][pos]
                                       for cmp, pos in imgNcoord[1]])

    with index.searcher() as searcher:
        qpHead = qparser.QueryParser('ccompsHead', index.schema)
        qpTail = qparser.QueryParser('ccompsTail', index.schema)
        qpHead.add_plugin(qparser.RegexPlugin())
        qpTail.add_plugin(qparser.RegexPlugin())
        analyze = KeywordAnalyzer()

        for token in analyze(text):
            t = token.text
            if t not in char2Images:
                # first, we search for all possible n-grams for a given token
                allGrams = []
                for n in range(len(t)):
                    for ngram in ngrams(t, len(t) - n):
                        allGrams.append(''.join(str(i) for i in ngram))

                # positional indices for the grams; these are used to find the
                # "not longest" substring: ((length substr, offset left), substr)
                indexGrams = zip(
                    [(n + 1, j) for n in range(len(t))
                     for j in range(len(t) - n)[::-1]][::-1],
                    allGrams)

                # then we search the longest matching substring
                longestSubString = ''
                coord = None
                for lenStart, gram in indexGrams:
                    if gram not in char2Images:
                        q = qpHead.parse(gram)
                        result = searcher.search(q)
                        if result:
                            randchoice = choice(list(result))
                            positions = choice(dict(randchoice['ccompsHeadTrace'])[gram])
                            char2Images[gram] = [randchoice['image'],
                                                 getSubTokens((randchoice['image'], positions))]
                            coord, longestSubString = lenStart, gram
                            break
                    else:
                        coord, longestSubString = lenStart, gram
                        break

                # rest of the string/token
                leftMiss = t[:coord[1]]
                rightMiss = t[coord[1] + coord[0]:]

                if leftMiss:
                    result = find(qpHead, searcher, leftMiss, char2Images)
                    for r in result:
                        if r[0] not in char2Images.keys() and r[1] != '_':  # duplicates?
                            # 0=ccomp, 1=image, 2=headtrace
                            char2Images[r[0]] = [r[1], getSubTokens((r[1], r[2]))]
                        orderedComps.append(r[0])

                # middle of the word
                orderedComps.append(longestSubString)

                if rightMiss:
                    result = find(qpTail, searcher, rightMiss, char2Images, qpHead)
                    for r in result:  # e.g. ('al', 'dir/name.png', ((0, 1), (0, 2)))
                        if r[0] not in char2Images.keys() and r[1] != '_':
                            if r[2] == [(-1, -1)]:
                                char2Images['_' + r[0]] = [r[1], [r[0]]]
                                orderedComps.append('_' + r[0])
                            else:
                                char2Images[r[0]] = [r[1], getSubTokens((r[1], r[2]))]
                                orderedComps.append(r[0])
                        else:
                            orderedComps.append(r[0])
            else:
                orderedComps.append(t)
            orderedComps.append(' ')  # space between words

    orderedComps.pop()  # removes last space
    return char2Images, orderedComps
def query(index, text, forceHead=False):
    """
    :param index: whoosh index
    :param text: str. Text to search in the index
    :param forceHead: bool. Only allow non-ending tokens to be searched
    :return: a map and a list. The map has as keys strings or substrings of
        the text retrieved from the index, and as values the images that
        contain the glyphs/connected components, plus those components as
        strings.

    Input: query("ae")
    Output:
    """
    assert isinstance(forceHead, bool)

    char2Images = defaultdict(list)  # e.g. 'h': 'path/image.png'
    orderedComps = []  # 'h', 'e', 'll', 'o'

    with open(connCompsRichJSON, 'r') as ccfile:
        inputTextCC = load(ccfile)

    getSubTokens = (lambda imgNcoord: [inputTextCC[imgNcoord[0]][cmp][pos]
                                       for cmp, pos in imgNcoord[1]])

    with index.searcher() as searcher:
        # qpHead = qparser.QueryParser('ccompsHead', index.schema)
        # qpTail = qparser.QueryParser('ccompsTail', index.schema)
        # qpHead.remove_plugin_class(qparser.WildcardPlugin)
        # qpTail.remove_plugin_class(qparser.WildcardPlugin)
        # qpHead.add_plugin(qparser.RegexPlugin())
        # qpTail.add_plugin(qparser.RegexPlugin())
        analyze = KeywordAnalyzer()

        for token in analyze(text):
            t = token.text
            if t not in char2Images:
                # first, we search for all possible n-grams for a given token;
                # allGrams is the power set of the letters that make up the word
                allGrams = []
                for n in range(len(t)):
                    for ngram in ngrams(t, len(t) - n):
                        allGrams.append(''.join(str(i) for i in ngram))

                # positional indices for the grams; these are used to find the
                # smaller substrings: ((length substr, offset left), substr)
                indexGrams = list(zip(
                    [(n + 1, j) for n in range(len(t))
                     for j in range(len(t) - n)[::-1]][::-1],
                    allGrams))

                tmp_ordComp = []   # sublist of orderedComps for the current token
                collectedChar = 0  # characters of the word covered so far
                i = 0
                while collectedChar < len(t) and i < len(indexGrams):
                    lenStart, gram = indexGrams[i]
                    _length, _start = lenStart
                    prune = True
                    inTail = False
                    endGram = ""

                    if gram not in char2Images:
                        # tail or head; -3 prevents taking wrong tokens
                        if _start in range(len(t) - 3, len(t)) and gram in (
                                's', 'e', 'l', 't', 'm', 'n', 'q&', 'b&', '&',
                                '1', '2') and not forceHead:
                            q = Term("ccompsTail", gram)
                            inTail = True
                            endGram = "_" + gram
                        elif _start in range(len(t) - 3, len(t)) and gram in (
                                'q&', 'b&', '&', '1', '2') and forceHead:
                            q = Term("ccompsTail", gram)
                            inTail = True
                            endGram = "_" + gram
                        else:
                            q = Term("ccompsHead", gram)

                        result = searcher.search(q)
                        # if gram in richTokensConv.values():
                        #     print('\n//// ', gram, '\n', result, '\n\n')

                        # handling results
                        if result and endGram not in char2Images:
                            randchoice = choice(list(result))
                            if inTail:
                                char2Images[endGram] = [randchoice['image'], list(gram)]
                                tmp_ordComp.append((lenStart, endGram))
                            else:
                                try:
                                    positions = choice(dict(randchoice['ccompsHeadTrace'])[gram])
                                except KeyError:
                                    # gram not in the head trace: bail out
                                    dict(randchoice['ccompsHeadTrace'])
                                    return
                                try:
                                    char2Images[gram] = [randchoice['image'],
                                                         getSubTokens((randchoice['image'], positions))]
                                except IndexError:
                                    print('randchoice ', randchoice)
                                    print('positions', positions)
                                    print(inputTextCC[randchoice['image']])
                                    print('_________\n\n')
                                    raise IndexError
                                tmp_ordComp.append((lenStart, gram))
                        elif endGram in char2Images:
                            tmp_ordComp.append((lenStart, endGram))
                            break
                        else:
                            prune = False
                    else:
                        # already taken
                        tmp_ordComp.append((lenStart, gram))

                    if prune:
                        collectedChar += _length
                        pruned = [el for el in indexGrams[i + 1:]
                                  if not (_start <= el[0][1] < _length + _start or                # start
                                          _start <= el[0][0] + el[0][1] - 1 < _length + _start)]  # end
                        indexGrams = indexGrams[:i + 1] + pruned
                    i += 1

                orderedComps.extend([oc[1] for oc in
                                     sorted(tmp_ordComp, key=lambda x: x[0][1])])
            else:
                orderedComps.append(t)
            orderedComps.append(' ')  # space between words

    orderedComps.pop()  # removes last space
    return char2Images, orderedComps
def query(index, text):
    """
    :param index: whoosh index
    :param text:
    :return:
    """
    char2Images = defaultdict(list)  # e.g. 'h': 'path/image.png'
    orderedComps = []  # 'h', 'e', 'll', 'o'

    with open(connCompsRichJSON, 'r') as ccfile:
        inputTextCC = load(ccfile)

    getSubTokens = (lambda imgNcoord: [inputTextCC[imgNcoord[0]][cmp][pos]
                                       for cmp, pos in imgNcoord[1]])

    with index.searcher() as searcher:
        qpHead = qparser.QueryParser('ccompsHead', index.schema)
        qpTail = qparser.QueryParser('ccompsTail', index.schema)
        qpHead.add_plugin(qparser.RegexPlugin())
        qpTail.add_plugin(qparser.RegexPlugin())
        analyze = KeywordAnalyzer()

        for token in analyze(text):
            t = token.text
            if t not in char2Images:
                # first, we search for all possible n-grams for a given token
                allGrams = []
                for n in range(len(t)):
                    for ngram in ngrams(t, len(t) - n):
                        allGrams.append(''.join(str(i) for i in ngram))

                # positional indices for the grams; these are used to find the
                # smaller substrings: ((length substr, offset left), substr)
                indexGrams = list(zip(
                    [(n + 1, j) for n in range(len(t))
                     for j in range(len(t) - n)[::-1]][::-1],
                    allGrams))

                tmp_ordComp = []   # sublist of orderedComps for the current token
                collectedChar = 0  # characters of the word covered so far
                i = 0
                while collectedChar < len(t) and i < len(indexGrams):
                    lenStart, gram = indexGrams[i]
                    _length, _start = lenStart
                    prune = True
                    inTail = False
                    endGram = ""

                    if gram not in char2Images:
                        # tail or head; -3 prevents taking wrong tokens
                        if _start in range(len(t) - 3, len(t)) and gram in (
                                's', 'e', 'l', 't', 'm', 'n', 'que', 'bus', 'us', 'ue'):
                            q = qpTail.parse(gram)
                            inTail = True
                            endGram = "_" + gram
                        else:
                            q = qpHead.parse(gram)

                        result = searcher.search(q)

                        # handling results
                        if result and endGram not in char2Images:
                            randchoice = choice(list(result))
                            if inTail:
                                char2Images[endGram] = [randchoice['image'], [gram]]
                                tmp_ordComp.append((lenStart, endGram))
                            else:
                                positions = choice(dict(randchoice['ccompsHeadTrace'])[gram])
                                char2Images[gram] = [randchoice['image'],
                                                     getSubTokens((randchoice['image'], positions))]
                                tmp_ordComp.append((lenStart, gram))
                        elif endGram in char2Images:
                            tmp_ordComp.append((lenStart, endGram))
                            break
                        else:
                            prune = False
                    else:
                        # already taken
                        tmp_ordComp.append((lenStart, gram))

                    if prune:
                        collectedChar += _length
                        pruned = [el for el in indexGrams[i + 1:]
                                  if not (_start <= el[0][1] < _length + _start or                # start
                                          _start <= el[0][0] + el[0][1] - 1 < _length + _start)]  # end
                        indexGrams = indexGrams[:i + 1] + pruned
                    i += 1

                orderedComps.extend([oc[1] for oc in
                                     sorted(tmp_ordComp, key=lambda x: x[0][1])])
            else:
                orderedComps.append(t)
            orderedComps.append(' ')  # space between words

    orderedComps.pop()  # removes last space
    return char2Images, orderedComps
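A hypothetical driver for the query functions above (everything here is an assumption except the query signature): it presumes the index was built beforehand in a directory such as 'glyph_index', with the ccompsHead/ccompsTail/ccompsHeadTrace/image fields these functions expect, and that connCompsRichJSON points to the connected-components JSON file.

from whoosh.index import open_dir

# Assumed index directory name; adapt to the actual project layout.
ix = open_dir('glyph_index')
mapping, components = query(ix, 'hello world')
for comp in components:
    # each component maps to the image(s) containing its glyphs
    print(comp, mapping.get(comp, mapping.get('_' + comp)))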
import flask
from whoosh import index, scoring
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.qparser import QueryParser
from whoosh.analysis import SimpleAnalyzer, CharsetFilter, KeywordAnalyzer
# from whoosh.support.charset import accent_map

# ===== GLOBAL VARIABLES ========
file_dir = "files"
index_dir = "index"
response = []
schema_name = "search"

schema = Schema(path=ID(unique=True, stored=True),
                time=STORED,
                content=TEXT(analyzer=KeywordAnalyzer()))
# my_analyzer = SimpleAnalyzer() | CharsetFilter(accent_map)
# schema_fuzzy = Schema(path=ID(unique=True, stored=True), time=STORED,
#                       content=TEXT(analyzer=my_analyzer))

# ========= REST API =============
app = flask.Flask(__name__)
app.config["DEBUG"] = True


# Home
@app.route('/', methods=['GET'])
def home():
    return ("<h1>Home</h1><p>Try to pass search terms with: "
            "http://127.0.0.1:5000/search?q=comma,separated,search,terms</p>")
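The /search endpoint advertised by the home route is not part of this excerpt. The sketch below is one plausible implementation, assuming the index already exists under index_dir with indexname schema_name; the OrGroup grouping, BM25F weighting, and JSON response shape are assumptions.

from whoosh.index import open_dir
from whoosh.qparser import OrGroup

@app.route('/search', methods=['GET'])
def search():
    # Comma-separated terms become a space-separated OR query.
    terms = flask.request.args.get('q', '').replace(',', ' ')
    ix = open_dir(index_dir, indexname=schema_name)
    parser = QueryParser('content', ix.schema, group=OrGroup)
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        results = searcher.search(parser.parse(terms), limit=10)
        hits = [{'path': hit['path'], 'score': hit.score} for hit in results]
    return flask.jsonify(hits)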
            num_added_records_so_far += 1
            if (num_added_records_so_far % 100 == 0):
                print(" num_added_records_so_far= " + str(num_added_records_so_far))

        writer.commit()   # it is necessary to store the index once filled
        in_file.close()   # it is necessary to close the .csv file


'''
Here the "schemas" function is used to create and fill all the schemas (indexes)
for both .csv files (Cranfield.csv and Time.csv).
'''

# all the analyzers that are used
analyzers = [StemmingAnalyzer(), StandardAnalyzer(), RegexAnalyzer(),
             SimpleAnalyzer(), FancyAnalyzer(), NgramAnalyzer(4),
             KeywordAnalyzer(), LanguageAnalyzer('en')]

# analyzer names
analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer',
                  'SimpleAnalyzer', 'FancyAnalyzer', 'NgramAnalyzer',
                  'KeywordAnalyzer', 'LanguageAnalyzer']

csv_names = ['Cranfield', 'Time']  # file names

# iterate over all the .csv files (only two: Cranfield.csv and Time.csv)
for name in csv_names:
    print(name, '\n\n')
    path = "C:./" + name + "_DATASET"  # path where the .csv is stored
    # iterate over the analyzers to create the 8 different inverted indexes
    for e, type_analyzer in enumerate(analyzers):
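The body of the inner loop is cut off above. A hedged sketch of what each iteration could do, per the comment (one inverted index per analyzer), is given below; the helper name, directory layout, and schema fields are assumptions, not taken from the original.

import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

def make_index(base_path, analyzer, analyzer_name):
    # Hypothetical helper: build one inverted-index directory per analyzer.
    schema = Schema(id=ID(stored=True),
                    content=TEXT(analyzer=analyzer))
    index_path = os.path.join(base_path, analyzer_name + "_index")
    os.makedirs(index_path, exist_ok=True)
    return create_in(index_path, schema)

# e.g., inside the loop above:
#     ix = make_index(path, type_analyzer, analyzer_names[e])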