Example #1
    def __init__(self,
                 stored=False,
                 lowercase=False,
                 commas=False,
                 vector=None,
                 scorable=False,
                 unique=False,
                 field_boost=1.0,
                 spelling=False):
        """
        :param stored: Whether to store the value of the field with the
            document.
        :param commas: Whether this is a comma-separated field. If this is False
            (the default), it is treated as a space-separated field.
        :param scorable: Whether this field is scorable.
        """

        self.analyzer = KeywordAnalyzer(lowercase=lowercase, commas=commas)
        self.format = formats.Frequency(field_boost=field_boost)
        self.scorable = scorable
        self.stored = stored
        self.unique = unique
        self.spelling = spelling

        if vector:
            if type(vector) is type:
                vector = vector()
            elif isinstance(vector, formats.Format):
                pass
            else:
                vector = self.format
        else:
            vector = None
        self.vector = vector
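
A short usage sketch for the field type above, assuming (as the signature suggests) that this is the __init__ of Whoosh's KEYWORD field; the schema and field values are illustrative only:

from whoosh.fields import Schema, ID, KEYWORD

# "tags" is stored, comma-separated, lowercased and scorable, so it contributes to ranking.
schema = Schema(doc_id=ID(stored=True, unique=True),
                tags=KEYWORD(stored=True, commas=True, lowercase=True, scorable=True))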
Example #2
def exec_comp():
    '''
    Calculates the Mean Reciprocal Rank (MRR) and saves a table with the MRR
    evaluation for every search-engine configuration.
    '''
    #text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]
    sel_ana = [
        'StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
        'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
        'KeywordAnalyzer()', "LanguageAnalyzer('en')"
    ]  # labels used for the graph and for the MRR table

    i = 0  #counter
    mrrs = []  #list where MRR values for each SE configuration will be stored

    #scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    #ground truth
    gt1 = pd.read_csv(os.getcwd() +
                      "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
                      sep='\t')

    #combinations for every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            sr_1 = exec_queries(
                selected_analyzers[x], scoring_functions[y]
            )  # execute queries for the chosen configuration combination
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv",
                        index=False)  #save results of the search engine
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1,
                                                        sr_1)))  #calculate MRR
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv",
                       index=False)  #store MRR table
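
exec_comp() relies on two helpers that are not shown here, exec_queries() and mrr(). Below is a minimal sketch of how such an mrr() helper could compute the Mean Reciprocal Rank from the ground truth and a ranked results table; the column names ('Query_id', 'Relevant_Doc_id', 'Doc_ID', 'Rank') are assumptions, not the project's actual layout:

import pandas as pd

def mrr(ground_truth, results):
    """Mean Reciprocal Rank over all queries (sketch with hypothetical column names)."""
    # set of relevant documents per query
    relevant = ground_truth.groupby('Query_id')['Relevant_Doc_id'].apply(set)
    reciprocal_ranks = []
    for query_id, hits in results.groupby('Query_id'):
        ranked_docs = hits.sort_values('Rank')['Doc_ID'].tolist()
        rr = 0.0
        for rank, doc in enumerate(ranked_docs, start=1):
            if doc in relevant.get(query_id, set()):
                rr = 1.0 / rank  # reciprocal rank of the first relevant hit
                break
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0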
Example #3
    def __init__(self, stored=False, lowercase=False, commas=False,
                 scorable=False, unique=False, field_boost=1.0):
        """
        :param stored: Whether to store the value of the field with the document.
        :param commas: Whether this is a comma-separated field. If this is False
            (the default), it is treated as a space-separated field.
        :param scorable: Whether this field is scorable.
        """

        ana = KeywordAnalyzer(lowercase=lowercase, commas=commas)
        self.format = Frequency(analyzer=ana, field_boost=field_boost)
        self.scorable = scorable
        self.stored = stored
        self.unique = unique
Example #4
def create():
    """ Create a new Whoosh index.. """
    print 'creating new index in directory %s' % DIRECTORY
    os.system('rm -rf %s' % DIRECTORY)
    os.mkdir(DIRECTORY)
    schema = Schema(source=ID(stored=True, unique=True),
                    cached=ID(stored=True, unique=True),
                    hash=ID(stored=True, unique=True),
                    title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    year=TEXT(stored=True),
                    notes=TEXT(stored=True),
                    text=TEXT(stored=True),
                    tags=TEXT(stored=True, analyzer=KeywordAnalyzer()),
                    added=DATETIME(stored=True),
                    mtime=DATETIME(stored=True))
    create_in(DIRECTORY, schema, NAME)
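
Once create() has run, the index can be opened and filled with the standard Whoosh writer API. A minimal sketch with placeholder field values:

from datetime import datetime
from whoosh.index import open_dir

ix = open_dir(DIRECTORY, indexname=NAME)
writer = ix.writer()
writer.add_document(source=u"papers/example.pdf",   # placeholder values
                    cached=u"cache/example.txt",
                    hash=u"0123abcd",
                    title=u"An Example Paper",
                    author=u"Doe, J.",
                    year=u"2021",
                    text=u"full extracted text ...",
                    tags=u"whoosh indexing example",
                    added=datetime.now(),
                    mtime=datetime.now())
writer.commit()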
Example #5
def queryRec(index, text):
    """
    >>> queryRec(MyIndex, 'ciao')

            defaultdict(<class 'list'>,
                      {'cia': ['059r/413_780_36_104.png', ((1, 2), (1, 3), (1, 4))],
                      'o': ('051r/751_1468_23_33.png', (-1, -1))})                      # clearly tail

            ['cia', 'o']

    :param index: whoosh index
    :param text: String. Input text
    :return: defaultdict, list.

    """
    char2Images = defaultdict(list)  # eg. 'a': 'path/image.png'
    orderedComps = []  # 'h', 'e', 'll', 'o'

    with open(connCompsRichJSON, 'r') as ccfile:
        inputTextCC = load(ccfile)

    getSubTokens = (lambda imgNcoord: [inputTextCC[imgNcoord[0]][cmp][pos] for cmp, pos in imgNcoord[1]])

    with index.searcher() as searcher:
        qpHead = qparser.QueryParser('ccompsHead', index.schema)
        qpTail = qparser.QueryParser('ccompsTail', index.schema)
        qpHead.add_plugin(qparser.RegexPlugin())
        qpTail.add_plugin(qparser.RegexPlugin())
        analyze = KeywordAnalyzer()

        for token in analyze(text):
            t = token.text
            if t not in char2Images:
                # first, we search for all possible n-grams for a given token
                allGrams = []
                for n in range(len(t)):
                    for ngram in ngrams(t, len(t) - n):
                        allGrams.append(''.join(str(i) for i in ngram))

                """
                positional indices for grams
                this will be used to find the "not longest" substring
                (length substr, offset Left, substr)
                """
                indexGrams = zip(
                    [(n + 1, j) for n in range(len(t)) for j in range(len(t) - n)[::-1]][::-1],
                    allGrams
                )

                # then we search the longest matching substring
                longestSubString = ''
                coord = None

                for lenStart, gram in indexGrams:
                    if gram not in char2Images:
                        q = qpHead.parse(gram)
                        result = searcher.search(q)
                        if result:
                            randchoice = choice(list(result))
                            positions = choice(dict(randchoice['ccompsHeadTrace'])[gram])
                            char2Images[gram] = [randchoice['image'], getSubTokens((randchoice['image'], positions))]
                            coord, longestSubString = lenStart, gram
                            break
                    else:
                        coord, longestSubString = lenStart, gram
                        break

                # rest of the string/token
                leftMiss = t[:coord[1]]
                rightMiss = t[coord[1] + coord[0]:]

                if leftMiss:
                    result = find(qpHead, searcher, leftMiss, char2Images)
                    for r in result:
                        if r[0] not in char2Images.keys() and r[1] != '_':  # duplicates?
                            # 0=ccomp, 1=image, 2=headtrace
                            char2Images[r[0]] = [r[1], getSubTokens((r[1], r[2]))]
                        orderedComps.append(r[0])

                # middle of the word
                orderedComps.append(longestSubString)

                if rightMiss:
                    result = find(qpTail, searcher, rightMiss, char2Images, qpHead)
                    for r in result:
                        # ('al', 'dir/name.png', ((0, 1), (0, 2)))
                        if r[0] not in char2Images.keys() and r[1] != '_':
                            if r[2] == [(-1, -1)]:
                                char2Images['_' + r[0]] = [r[1], [r[0]]]
                                orderedComps.append('_' + r[0])
                            else:
                                char2Images[r[0]] = [r[1], getSubTokens((r[1], r[2]))]
                                orderedComps.append(r[0])
                        else:
                            orderedComps.append(r[0])
            else:
                orderedComps.append(t)
            orderedComps.append(' ')  # space between words

        orderedComps.pop()  # removes last space

    return char2Images, orderedComps
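
The allGrams/indexGrams construction above pairs every substring of a token with its (length, left offset). A self-contained sketch of that enumeration (plain slicing is used here instead of the ngrams() helper, which the original presumably imports from nltk):

t = "ciao"  # example token
allGrams = [t[j:j + (len(t) - n)] for n in range(len(t)) for j in range(n + 1)]
indexGrams = list(zip(
    [(n + 1, j) for n in range(len(t)) for j in range(len(t) - n)[::-1]][::-1],
    allGrams))
print(indexGrams)
# [((4, 0), 'ciao'), ((3, 0), 'cia'), ((3, 1), 'iao'), ((2, 0), 'ci'), ((2, 1), 'ia'),
#  ((2, 2), 'ao'), ((1, 0), 'c'), ((1, 1), 'i'), ((1, 2), 'a'), ((1, 3), 'o')]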
Example #6
def query(index, text, forceHead=False):
    """

    :param index: whoosh index
    :param text: str. Text to search in the index
    :param forceHead: bool. If True, only allow non-ending tokens to be searched
    :return: a map and a list.
            The map's keys are strings or substrings of the text retrieved from the index; its values are the
            images that contain the glyphs/connected components, together with those components as strings.


            Input: query("ae")
            Output: 

    """
    assert isinstance(forceHead, bool)
    char2Images = defaultdict(list)  # eg. 'h': 'path/image.png'
    orderedComps = []  # 'h', 'e', 'll', 'o'

    with open(connCompsRichJSON, 'r') as ccfile:
        inputTextCC = load(ccfile)

    getSubTokens = (lambda imgNcoord: [inputTextCC[imgNcoord[0]][cmp][pos] for cmp, pos in imgNcoord[1]])

    with index.searcher() as searcher:
        # qpHead = qparser.QueryParser('ccompsHead', index.schema)
        # qpTail = qparser.QueryParser('ccompsTail', index.schema)
        # qpHead.remove_plugin_class(qparser.WildcardPlugin)
        # qpTail.remove_plugin_class(qparser.WildcardPlugin)
        # qpHead.add_plugin(qparser.RegexPlugin())
        # qpTail.add_plugin(qparser.RegexPlugin())
        analyze = KeywordAnalyzer()

        for token in analyze(text):
            t = token.text
            if t not in char2Images:
                # first, we search for all possible n-grams for a given token
                # allGrams is the power set of the letters that make up the word
                allGrams = []
                for n in range(len(t)):
                    for ngram in ngrams(t, len(t) - n):
                        allGrams.append(''.join(str(i) for i in ngram))

                """
                positional indices for grams
                this will be used to find the smaller substrings
                (length substr, offset Left, substr)

                """
                indexGrams = list(zip(
                    [(n + 1, j) for n in range(len(t)) for j in range(len(t) - n)[::-1]][::-1],
                    allGrams
                ))

                tmp_ordComp = []   # sublist of orderedComps for the current token
                collectedChar = 0  # whole word taken
                i = 0

                while collectedChar < len(t) and i < len(indexGrams):
                    lenStart, gram = indexGrams[i]
                    _length, _start = lenStart
                    prune = True
                    inTail = False
                    endGram = ""

                    if gram not in char2Images:
                        # tail or head; -3 prevents taking wrong tokens
                        if _start in range(len(t) - 3, len(t)) and gram in (
                                's', 'e', 'l', 't', 'm', 'n', 'q&', 'b&', '&', '1', '2') and not forceHead:
                            q = Term("ccompsTail", gram)
                            inTail = True
                            endGram = "_" + gram
                        elif _start in range(len(t) - 3, len(t)) and gram in ('q&', 'b&', '&', '1', '2') and forceHead:
                            q = Term("ccompsTail", gram)
                            inTail = True
                            endGram = "_" + gram
                        else:
                            q = Term("ccompsHead", gram)
                        result = searcher.search(q)

                        # if gram in richTokensConv.values():
                        #     print('\n//// ', gram, '\n', result, '\n\n')

                        # handling results
                        if result and endGram not in char2Images:
                            randchoice = choice(list(result))

                            if inTail:
                                char2Images[endGram] = [randchoice['image'], list(gram)]
                                tmp_ordComp.append((lenStart, endGram))
                            else:
                                try:
                                    positions = choice(dict(randchoice['ccompsHeadTrace'])[gram])
                                except KeyError:
                                    # gram not present in the head trace: give up on this query
                                    return
                                try:
                                    char2Images[gram] = [randchoice['image'], getSubTokens((randchoice['image'], positions))]
                                except IndexError:
                                    print('randchoice ', randchoice)
                                    print('positions', positions)
                                    print(inputTextCC[randchoice['image']])
                                    print('_________\n\n')
                                    raise
                                tmp_ordComp.append((lenStart, gram))

                        elif endGram in char2Images:
                            tmp_ordComp.append((lenStart, endGram))
                            break
                        else:
                            prune = False
                    else:
                        # already taken
                        tmp_ordComp.append((lenStart, gram))

                    if prune:
                        collectedChar += _length
                        pruned = [el for el in indexGrams[i + 1:]
                                  if not (_start <= el[0][1] < _length + _start or               # start
                                          _start <= el[0][0] + el[0][1] - 1 < _length + _start)  # end
                                  ]
                        indexGrams = indexGrams[:i + 1] + pruned

                    i += 1

                orderedComps.extend([oc[1] for oc in sorted(tmp_ordComp, key=lambda x: x[0][1])])
            else:
                orderedComps.append(t)
            orderedComps.append(' ')  # space between words

        orderedComps.pop()  # removes last space
        return char2Images, orderedComps
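
The pruning step inside the while loop discards every remaining candidate gram whose start or end falls inside the interval just covered. A standalone sketch on the same hypothetical token "ciao", reusing the indexGrams layout from the previous example:

t = "ciao"
allGrams = [t[j:j + (len(t) - n)] for n in range(len(t)) for j in range(n + 1)]
indexGrams = list(zip(
    [(n + 1, j) for n in range(len(t)) for j in range(len(t) - n)[::-1]][::-1],
    allGrams))

i = 1                                    # suppose the search accepted indexGrams[1]
(_length, _start), gram = indexGrams[i]  # 'cia', length 3 at offset 0

pruned = [el for el in indexGrams[i + 1:]
          if not (_start <= el[0][1] < _length + _start or                # start inside
                  _start <= el[0][0] + el[0][1] - 1 < _length + _start)]  # end inside
indexGrams = indexGrams[:i + 1] + pruned
print(indexGrams)  # [((4, 0), 'ciao'), ((3, 0), 'cia'), ((1, 3), 'o')]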
Example #7
def query(index, text):
    """

    :param index: whoosh index
    :param text:
    :return:
    """
    char2Images = defaultdict(list)  # eg. 'h': 'path/image.png'
    orderedComps = []  # 'h', 'e', 'll', 'o'

    with open(connCompsRichJSON, 'r') as ccfile:
        inputTextCC = load(ccfile)

    getSubTokens = (
        lambda imgNcoord:
        [inputTextCC[imgNcoord[0]][cmp][pos] for cmp, pos in imgNcoord[1]])
    with index.searcher() as searcher:
        qpHead = qparser.QueryParser('ccompsHead', index.schema)
        qpTail = qparser.QueryParser('ccompsTail', index.schema)
        qpHead.add_plugin(qparser.RegexPlugin())
        qpTail.add_plugin(qparser.RegexPlugin())
        analyze = KeywordAnalyzer()

        for token in analyze(text):
            t = token.text
            if t not in char2Images:
                # first, we search for all possible n-grams for a given token
                allGrams = []
                for n in range(len(t)):
                    for ngram in ngrams(t, len(t) - n):
                        allGrams.append(''.join(str(i) for i in ngram))
                """
                positional indices for grams
                this will be used to find the smaller substrings
                (length substr, offset Left, substr)
                """
                indexGrams = list(
                    zip([(n + 1, j) for n in range(len(t))
                         for j in range(len(t) - n)[::-1]][::-1], allGrams))

                tmp_ordComp = []  # sublist of orderedComps for the current token
                collectedChar = 0  # whole word taken
                i = 0

                while collectedChar < len(t) and i < len(indexGrams):
                    lenStart, gram = indexGrams[i]
                    _length, _start = lenStart
                    prune = True
                    inTail = False
                    endGram = ""

                    if gram not in char2Images:
                        # tail or head; -3 prevents taking wrong tokens
                        if _start in range(len(t) - 3, len(t)) and gram in (
                                's', 'e', 'l', 't', 'm', 'n', 'que', 'bus',
                                'us', 'ue'):
                            q = qpTail.parse(gram)
                            inTail = True
                            endGram = "_" + gram
                        else:
                            q = qpHead.parse(gram)
                        result = searcher.search(q)

                        # handling results
                        if result and endGram not in char2Images:
                            randchoice = choice(list(result))

                            if inTail:
                                char2Images[endGram] = [
                                    randchoice['image'], [gram]
                                ]
                                tmp_ordComp.append((lenStart, endGram))
                            else:
                                positions = choice(
                                    dict(randchoice['ccompsHeadTrace'])[gram])
                                char2Images[gram] = [
                                    randchoice['image'],
                                    getSubTokens(
                                        (randchoice['image'], positions))
                                ]
                                tmp_ordComp.append((lenStart, gram))

                        elif endGram in char2Images:
                            tmp_ordComp.append((lenStart, endGram))
                            break
                        else:
                            prune = False
                    else:
                        # already taken
                        tmp_ordComp.append((lenStart, gram))

                    if prune:
                        collectedChar += _length
                        pruned = [
                            el for el in indexGrams[i + 1:]
                            if not (_start <= el[0][1] < _length +
                                    _start or  # start
                                    _start <= el[0][0] + el[0][1] -
                                    1 < _length + _start)  # end
                        ]
                        indexGrams = indexGrams[:i + 1] + pruned

                    i += 1

                orderedComps.extend([
                    oc[1] for oc in sorted(tmp_ordComp, key=lambda x: x[0][1])
                ])
            else:
                orderedComps.append(t)
            orderedComps.append(' ')  # space between words

        orderedComps.pop()  # removes last space
        return char2Images, orderedComps
Example #8
import flask

from whoosh import index, scoring
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.qparser import QueryParser
from whoosh.analysis import SimpleAnalyzer, CharsetFilter, KeywordAnalyzer
#from whoosh.support.charset import accent_map

# ===== GLOBAL VARIABLES ========

file_dir = "files"
index_dir = "index"
response = []
schema_name = "search"
schema = Schema(path=ID(unique=True, stored=True),
                time=STORED,
                content=TEXT(analyzer=KeywordAnalyzer()))

#my_analyzer = SimpleAnalyzer() | CharsetFilter(accent_map)
#schema_fuzzy = Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT(analyzer=my_analyzer))

# ========= REST API =============
app = flask.Flask(__name__)
app.config["DEBUG"] = True


# Home
@app.route('/', methods=['GET'])
def home():
    return "<h1>Home</h1><p>Try to pass search terms with: http://127.0.0.1:5000/search?q=comma,separated,search,terms</p>"

Example #9
        #
        num_added_records_so_far += 1
        if (num_added_records_so_far % 100 == 0):
            print(" num_added_records_so_far= " + str(num_added_records_so_far))
    #
    writer.commit()  # commit to persist the index once it has been filled
    in_file.close()  # close the .csv file


'''
Here the "schemas" function is used to create and fill all the schemas (indexes)
for both .csv files (Cranfield.csv and Time.csv).
'''

analyzers = [StemmingAnalyzer(), StandardAnalyzer(), RegexAnalyzer(), SimpleAnalyzer(),
             FancyAnalyzer(), NgramAnalyzer(4), KeywordAnalyzer(), LanguageAnalyzer('en')] # all the analyzers that are used
analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer', 'SimpleAnalyzer',
                 'FancyAnalyzer', 'NgramAnalyzer', 'KeywordAnalyzer',  'LanguageAnalyzer'] # analyzers names

csv_names = ['Cranfield', 'Time'] # file names



# iterate over all the .csv files (here just the two: Cranfield.csv and Time.csv)
for name in csv_names: 
    
    print(name, '\n\n')
    
    path = "C:./"+name+"_DATASET" # get the path where the .csv is stored
    for e,type_analyzer in enumerate(analyzers): # now the iteration is necessary to create the 8 different inverted indexes