Beispiel #1
0
    def __insert_journal(self):
        """Append the journals listed in ``jl.txt`` to the venue index.

        Every non-blank line of ``jl.txt`` is split on ``~``; fields 0, 1, 2,
        5 and 6 are stored as ``key``, ``title``, ``year``, ``url`` and ``ee``
        of a new document whose ``pubtype`` is ``'journal'``.  ``jl.txt`` is
        deleted once the documents have been committed.

        Raises:
            OSError: if ``jl.txt`` cannot be opened or removed.
            IndexError: if a non-blank line has fewer than seven fields.

        In both failure cases raised while reading, the pending writer is
        cancelled before the error propagates.
        """

        cprint('Journal commit started', 'pink')
        vix = open_dir(self.ven_index_path)
        writer = vix.writer()
        print('\tVenues count without journal: ' + str(vix.doc_count()))
        try:
            with open('jl.txt', 'r') as f:
                # Iterate the file lazily instead of loading every line.
                for line in f:
                    # Drop the line terminator so it cannot end up inside
                    # the last stored field.
                    line = line.rstrip('\n')
                    if not line:
                        continue
                    fields = line.split('~')

                    writer.add_document(
                        key=fields[0],
                        pubtype='journal',
                        title=fields[1],
                        year=fields[2],
                        url=fields[5],
                        ee=fields[6],
                        author='',
                        publisher='',
                        isbn='',
                    )
        except BaseException:
            # Give the index write lock back instead of leaving a
            # half-filled writer open, then let the error surface.
            writer.cancel()
            raise
        writer.commit()
        print('\tVenues count with journal: ' + str(vix.doc_count()))
        cprint('Journal commit ended', 'purple')
        os.remove('jl.txt')
Beispiel #2
0
def print_alternative(alt):
    """Print the other publications contained in a given venue.

    Only the first ten entries of ``alt`` are considered and empty strings
    among them are skipped.  A final ``' ...'`` line is printed when ``alt``
    holds more than ten entries.
    """

    bullet = '\t- '
    cprint('Pubs Included', *alt_obj, start='\t')
    for pub in alt[:10]:
        if pub != '':
            cprint(pub.strip(), *argument, start=bullet)
    if len(alt) > 10:
        cprint(' ...', *argument, start=bullet)
def menu_text(*args, start='', end='\n'):
    """Print the ASCII-art menu banner through ``cprint``.

    Args:
        *args: forwarded unchanged to ``cprint`` after the banner text.
        start: forwarded to ``cprint`` as its ``start`` keyword.
        end: forwarded to ``cprint`` as its ``end`` keyword.
    """
    # Raw string: the art is full of backslashes, none of which is meant as
    # an escape sequence.
    cprint(
        r"""
   ___  ___  __   ___    ____                 __     ____          _         
  / _ \/ _ )/ /  / _ \  / __/__ ___ _________/ /    / __/__  ___ _(_)__  ___ 
 / // / _  / /__/ ___/ _\ \/ -_) _ `/ __/ __/ _ \  / _// _ \/ _ `/ / _ \/ -_)
/____/____/____/_/    /___/\__/\_,_/_/  \__/_//_/ /___/_//_/\_, /_/_//_/\__/ 
                                                           /___/             
 """, *args, start=start, end=end)
def welcome_text(*args, start='', end='\n'):
    """Print the ASCII-art welcome banner through ``cprint``.

    Args:
        *args: forwarded unchanged to ``cprint`` after the banner text.
        start: forwarded to ``cprint`` as its ``start`` keyword.
        end: forwarded to ``cprint`` as its ``end`` keyword.
    """
    cprint(
        """
        ██╗    ██╗███████╗██╗      ██████╗ ██████╗ ███╗   ███╗███████╗
        ██║    ██║██╔════╝██║     ██╔════╝██╔═══██╗████╗ ████║██╔════╝
        ██║ █╗ ██║█████╗  ██║     ██║     ██║   ██║██╔████╔██║█████╗  
        ██║███╗██║██╔══╝  ██║     ██║     ██║   ██║██║╚██╔╝██║██╔══╝  
        ╚███╔███╔╝███████╗███████╗╚██████╗╚██████╔╝██║ ╚═╝ ██║███████╗
         ╚══╝╚══╝ ╚══════╝╚══════╝ ╚═════╝ ╚═════╝ ╚═╝     ╚═╝╚══════╝
        """, *args, start=start, end=end)
Beispiel #5
0
    def __indexing(self, handler, schema, parser, index_path):
        """Create the index at ``index_path`` and fill it from ``self.db_path``.

        Args:
            handler: callable that receives the index writer and returns the
                content handler installed on ``parser``.
            schema: schema handed to ``create_in``.
            parser: object exposing ``setContentHandler`` and ``parse``.
            index_path: location of the index.  When it contains ``'Pub'``
                the progress messages refer to publications, otherwise to
                venues.

        Raises:
            Any error raised by ``handler`` or ``parser``; the writer is
            cancelled before the error propagates.
        """

        # ** expands the returned dictionary into keyword arguments
        writer = create_in(index_path, schema).writer(**self.__resources(self))

        try:
            parser.setContentHandler(handler(writer))
            parser.parse(self.db_path)
        except BaseException:
            # Do not keep the index locked by an abandoned writer, so that a
            # later attempt can start from scratch.
            writer.cancel()
            raise

        if 'Pub' in index_path:
            started, ended, color = ('Pubs commit started',
                                     'Pubs commit ended.', 'green')
        else:
            started, ended, color = ('Venues commit started.',
                                     'Venues commit ended.', 'lightcyan')

        cprint(started, color)
        writer.commit()
        cprint(ended, color)
Beispiel #6
0
def check_ixs(silent=False):
    """Return the opened indexes, building them interactively when missing.

    ``check_open_ixs`` is tried first.  When it fails the user is asked for
    the DBLP file path and the indexes are created from it; the prompt is
    repeated until the creation succeeds.

    Args:
        silent: forwarded to the first ``check_open_ixs`` call.

    Returns:
        Whatever ``check_open_ixs`` returns, or ``None`` when the indexes
        still cannot be opened right after they have been created.
    """

    try:
        return check_open_ixs(silent=silent)
    except Exception:
        # ``Exception`` instead of a bare ``except`` throughout this
        # function, so Ctrl-C and ``SystemExit`` still stop the program
        # (index creation can run for a long time).
        while True:
            cprint('Indexes not found. Search Engine needs to create them.',
                   'orange', 'bold')
            db_path = input(form('Insert the DBLP file path: ', 'orange'))
            print()
            db_path = abspath(db_path)
            try:
                Index.create_ixs(Index(db_path))
            except Exception:
                cprint(
                    'It seems there is an error with the path. Please retry',
                    'red', 'bold')
                continue
            try:
                return check_open_ixs()
            except Exception:
                cprint('It seems there is an error.', 'red', 'bold')
            # The indexes were created but cannot be opened: give up.
            return None
    def frequency(self, fuzzy):
        """Search both indexes ranking the hits with the ``Frequency`` weighting.

        The query obtained from ``self.__ask_query()`` is converted by
        ``to_whoosh_query`` into a publication query and a venue query.  Each
        of them is split on ``' OR '``; every part is searched on its own and
        the partial results are merged.  The collected hits are then handed
        to ``self.__results``.

        Args:
            fuzzy: when true the queries are parsed with
                ``termclass=FuzzyTerm``.
        """

        pquery, vquery = to_whoosh_query(self.__ask_query())
        # The OR-ed parts are searched one by one and merged afterwards (the
        # original author notes that Frequency does not support OR queries).
        pub_parts = pquery.split(' OR ')
        ven_parts = vquery.split(' OR ')

        # Extra parser arguments: FuzzyTerm is only requested in fuzzy mode.
        parser_options = {'termclass': FuzzyTerm} if fuzzy else {}

        def search_part(searcher, schema, field, text, shown, merged):
            """Search ``text`` on ``field`` and fold the hits into ``merged``.

            The parsed query is recorded in ``shown``.  The first search
            becomes the result set; later ones are merged into it.
            """
            parsed = QueryParser(field, schema, **parser_options).parse(text)
            shown.add(str(parsed))
            hits = searcher.search(parsed, limit=None)
            if merged is None:
                return hits
            merged.upgrade_and_extend(hits)
            return merged

        pub_shown = set()
        ven_shown = set()
        print()

        # ----------- PUBLICATIONS ----------------------
        with self.pix.searcher(weighting=Frequency) as ps:
            # Query syntax reminder from the original author:
            # "" searches for a phrase; '' wraps terms containing characters
            # the parsers treat specially (spaces, colons, brackets).
            pub_hits = None
            for part in pub_parts:
                pub_hits = search_part(ps, self.pix.schema, 'title', part,
                                       pub_shown, pub_hits)

                # Parts that already start with a field name are not
                # repeated on the other fields.
                if part.startswith(('title', 'author', 'year'), ):
                    continue
                for field in ('author', 'year'):
                    pub_hits = search_part(ps, self.pix.schema, field, part,
                                           pub_shown, pub_hits)

            cprint("Pub Query: " + ' OR '.join(pub_shown),
                   'lightgrey',
                   'italic',
                   start='\t')
            cprint('Publications found: ' + str(len(pub_hits)),
                   'bold',
                   'lightgrey',
                   'url',
                   start='\t')

            # The hits must be copied while the searcher is still open.
            plist = []
            for hit in pub_hits:
                entry = {
                    'key': '',
                    'score': hit.score,
                    'pub': dict(hit.items()),
                    'ven': {},
                    'alternative': []
                }
                entry['pub']['o_score'] = entry['score']
                plist.append(entry)

        # --------------- VENUES --------------------------
        ven_hits = None
        with self.vix.searcher(weighting=Frequency) as vs:
            for part in ven_parts:
                ven_hits = search_part(vs, self.vix.schema, 'title', part,
                                       ven_shown, ven_hits)

                if part.startswith(('title:', 'publisher'), ):
                    continue
                ven_hits = search_part(vs, self.vix.schema, 'publisher', part,
                                       ven_shown, ven_hits)

            cprint("Ven Query: " + ' OR '.join(ven_shown),
                   'lightgrey',
                   'italic',
                   start='\t')
            cprint('Venues found: ' + str(len(ven_hits)),
                   'bold',
                   'lightgrey',
                   'url',
                   start='\t')

            vlist = []
            for hit in ven_hits:
                entry = {
                    'key': '',
                    'score': hit.score,
                    'ven': dict(hit.items()),
                    'pub': {},
                    'alternative': []
                }
                entry['ven']['o_score'] = entry['score']
                vlist.append(entry)

        # Merge the two lists and print the outcome.
        self.__results(plist, vlist)
    def bm25f(self, fuzzy):
        """Search both indexes with the searcher's default weighting.

        The query obtained from ``self.__ask_query()`` is converted by
        ``to_whoosh_query`` into a publication query and a venue query, each
        parsed with a ``MultifieldParser``.  The collected hits are handed to
        ``self.__results``.

        Args:
            fuzzy: when true the queries are parsed with
                ``termclass=FuzzyTerm``.
        """

        pquery, vquery = to_whoosh_query(self.__ask_query())
        print()

        # Extra parser arguments: FuzzyTerm is only requested in fuzzy mode.
        parser_options = {'termclass': FuzzyTerm} if fuzzy else {}

        # ----------- PUBLICATIONS ----------------------
        with self.pix.searcher() as ps:
            # Query syntax reminder from the original author:
            # "" searches for a phrase; '' wraps terms containing characters
            # the parsers treat specially (spaces, colons, brackets).

            pub_fields = ['author', 'title', 'year']
            if 'pubtype' not in pquery:
                # 'pubtype' is searched by default only when the query does
                # not mention it, to prevent a bad search on the search term.
                pub_fields.insert(0, 'pubtype')
            pquery = MultifieldParser(pub_fields, self.pix.schema,
                                      **parser_options).parse(pquery)

            cprint("Pub Query: " + str(pquery),
                   'lightgrey',
                   'italic',
                   start='\t')
            pub_hits = ps.search(pquery, limit=None)
            cprint('Publications found: ' + str(len(pub_hits)),
                   'bold',
                   'lightgrey',
                   'url',
                   start='\t')

            # The hits must be copied while the searcher is still open.
            plist = []
            for hit in pub_hits:
                entry = {
                    'key': '',
                    'score': hit.score,
                    'pub': dict(hit.items()),
                    'ven': {},
                    'alternative': []
                }
                entry['pub']['o_score'] = entry['score']
                plist.append(entry)

        # --------------- VENUES --------------------------
        with self.vix.searcher() as vs:
            vquery = MultifieldParser(['title', 'publisher'], self.vix.schema,
                                      **parser_options).parse(vquery)

            cprint("Ven Query: " + str(vquery),
                   'lightgrey',
                   'italic',
                   start='\t')
            ven_hits = vs.search(vquery, limit=None)
            cprint('Venues found: ' + str(len(ven_hits)),
                   'bold',
                   'lightgrey',
                   'url',
                   start='\t')

            vlist = []
            for hit in ven_hits:
                entry = {
                    'key': '',
                    'score': hit.score,
                    'ven': dict(hit.items()),
                    'pub': {},
                    'alternative': []
                }
                entry['ven']['o_score'] = entry['score']
                vlist.append(entry)

        # Merge the two lists and print the outcome.
        self.__results(plist, vlist)
    def __results(self, plist, vlist):
        """Combine publication and venue hits, then print the top results.

        Steps, in order:

        1. Both lists are sorted by ``'score'``, highest first.
        2. If one list is empty the other becomes the result list and each
           entry's ``'key'`` is copied from its ``'pub'``/``'ven'`` data;
           otherwise the two lists are combined by ``tr`` (defined outside
           this block).
        3. Results carrying both publication and venue data are grouped by
           venue key: their publications are gathered in a list and their
           scores are added up.
        4. The results are sorted by score again and either passed to
           ``self.__find_correlations`` (output level 3) or stored in
           ``self.__output``.
        5. Up to ``self.__result_limit`` entries of ``self.__output`` are
           printed with ``q_print``; ``self.__output`` is then emptied.

        Args:
            plist: list of publication result dicts (``'key'``, ``'score'``,
                ``'pub'``, ``'ven'``, ``'alternative'``).
            vlist: list of venue result dicts with the same keys.
        """

        plist = sorted(plist, key=lambda s: s['score'], reverse=True)
        vlist = sorted(vlist, key=lambda s: s['score'], reverse=True)

        if len(plist) == 0:
            # Venue-only results: the venue key identifies the entry.
            for el in vlist:
                el['key'] = el['ven']['key']
            results = vlist
        elif len(vlist) == 0:
            # Publication-only results: the publication key identifies it.
            for el in plist:
                el['key'] = el['pub']['key']
            results = plist
        else:
            results = tr(plist, vlist)

        # Merge the publications that share the same venue key.
        # same_venue remembers, for every venue key met so far, the position
        # in `results` of the entry that holds it.
        # NOTE(review): `results` is modified (del/remove/append) while it is
        # being iterated; end_cycle/end_tot are the guard used to stop early.
        same_venue = list()
        end_cycle = len(results)
        end_tot = 0
        for r in results:
            if end_tot >= end_cycle:
                break
            # Only entries with both publication and venue data are merged.
            if len(r['pub']) and len(r['ven']):
                if len(same_venue):
                    id = None
                    f = False
                    for i in range(len(same_venue)):
                        if same_venue[i]['key'] == r['ven']['key']:
                            f = True  # venue key already seen
                            id = i  # its position inside same_venue
                            break
                    if not f:
                        same_venue.append({
                            'key': r['ven']['key'],
                            'index': results.index(r)
                        })
                    # NOTE(review): `id` is a position in same_venue, yet it
                    # is used below to index `results`; other lines use
                    # same_venue[id]['index'] instead - confirm which one is
                    # intended.
                    elif isinstance(results[id]['pub'],
                                    dict):  # first duplicate: build a merged entry
                        tmp = {
                            'key':
                            r['ven']['key'],
                            'score':
                            r['pub']['o_score'] +
                            results[same_venue[id]['index']]['score'],
                            'pub': [
                                r['pub'],
                                results[same_venue[id]['index']]['pub'],
                            ],
                            'ven':
                            r['ven'],
                            'alternative': [],
                        }
                        del results[
                            id]  # drop the earlier entry and the current one
                        results.remove(r)
                        results.append(tmp)  # add the merged entry
                        same_venue[id]['index'] = results.index(
                            tmp)  # remember where the merged entry lives
                        end_cycle -= 2  # two entries were removed
                    else:
                        # A merged entry exists already: extend it.
                        results[id]['pub'].append(r['pub'])
                        results[id]['score'] += r['pub']['o_score']
                        results.remove(r)
                        end_cycle -= 1  # one entry was removed
                else:
                    # First entry with venue data: nothing to compare against.
                    same_venue.append({
                        'key': r['ven']['key'],
                        'index': results.index(r)
                    })

            end_tot += 1
        results = sorted(results, key=lambda s: s['score'], reverse=True)

        # Find correlations.
        # NOTE(review): at level 3 self.__output is not assigned here;
        # presumably __find_correlations fills it - confirm.
        if self.__output_level == 3:
            self.__find_correlations(results)
        else:
            self.__output = results

        cprint('RESULTS:', 'yellow', 'bold', 'url', start='\n\t', end='\n\n')
        count = 0
        for element in self.__output:
            # Stop once the configured number of results has been printed.
            if count == self.__result_limit:
                break
            q_print(element, count + 1, self.__output_level)
            count += 1

        # Reset the stored output once it has been printed.
        self.__output = list()
Beispiel #10
0
def q_print(element, count, level):
    """Print one result entry to the user.

    Three layouts are handled:

    * a single publication without alternatives, followed by its venue when
      ``level >= 2``;
    * a venue without publications, followed by its alternatives when
      ``level >= 3``;
    * a venue with its publications, followed by its alternatives when
      ``level >= 3``; the printed score is the venue's original score plus
      the original score of every publication.

    Args:
        element: result dict with the keys ``'score'``, ``'pub'``, ``'ven'``
            and ``'alternative'``.
        count: position number printed in front of the entry.
        level: output detail level.
    """

    def show_header(value):
        # "  <count>)<TAB>score: <value rounded to 5 digits>"
        cprint(''.join(('  ', str(count), ')\t', 'score: ',
                        str(round(value, 5)))), *score)

    def show_alternatives():
        if len(element['alternative']) and level >= 3:
            print_alternative(element['alternative'])

    alternatives = element['alternative']
    pubs = element['pub']

    # Exactly one publication: either a non-empty dict or a 1-item list.
    if isinstance(pubs, dict):
        single_pub = len(pubs) > 0
    elif isinstance(pubs, list):
        single_pub = len(pubs) == 1
    else:
        single_pub = False

    # - pub in venue --> score = pub.score
    if len(alternatives) == 0 and single_pub:
        show_header(element['score'])
        cprint('Publication', *main_obj, start='\t')
        print_pub(pubs if isinstance(pubs, dict) else pubs[0], level)

        if len(element['ven']) and level >= 2:
            if 'added' in element:
                venue_title = 'In Venue'
            else:
                venue_title = 'In Relevant Venue'
            cprint(venue_title, *alt_obj, start='\n\t')
            print_inven(element['ven'], level)

    # - venue with alternatives --> score = venue.score
    elif len(pubs) == 0:
        show_header(element['score'])
        cprint('Venue', *main_obj, start='\t')
        print_venue(element['ven'], level)
        show_alternatives()

    # - venue with pubs and alternatives
    #   --> score = original(venue.score + pubs.score)
    else:
        total = element['ven']['o_score']
        # Plain accumulation on purpose: keeps the same addition order.
        for pub in pubs:
            total += pub['o_score']
        show_header(total)

        cprint('Venue', *main_obj, start='\t')
        print_venue(element['ven'], level)
        print()
        cprint('Relevant Publications', *main_obj, start='\t')
        for pub in pubs:
            print_pub(pub, level)
            print()

        show_alternatives()

    print()
Beispiel #11
0
    def start(self):
        """Run the interactive main-menu loop until the user chooses to exit.

        Menu entries:
            1 - run a search with the current ranking, fuzzy flag, result
                limit and output level;
            2 - edit one of the settings (ranking, result limit, fuzzy flag,
                output level) or reset them;
            3 - print the current settings;
            4 - leave the loop.

        Any other input prints a retry message.  A result limit that is not
        an integer is rejected and the previous limit is kept.
        """
        check_ixs()
        while True:
            cprint('MAIN MENU\n', 'green', 'bold', 'url', start='\n\t')
            for choice in self.__choices_list:
                print(form(choice[0], *self.__colornumber), form(choice[1], *self.__colortext))
            self.__last_selected = input(form('\nType your choice:\n>  ', *self.__colorinput))

            # ----------- Search ---------------------1
            if self.__last_selected == '1':
                try:
                    rank = Rank(self.__result_limit, self.__output_level)
                    if self.__ranking == 'frequency':
                        rank.frequency(self.__fuzzy)
                    else:
                        rank.bm25f(self.__fuzzy)
                except Exception:
                    # ``Exception`` rather than a bare ``except`` so Ctrl-C
                    # still interrupts a running search.
                    cprint('Please, retry using the right sintax.', 'orange', 'bold', 'url', start='\r\t', end='\n\n')

            # ------------ Settings ---------------------
            elif self.__last_selected == '2':
                for option in self.__options_list:
                    print(form(option[0], *self.__colornumber), form(option[1], *self.__colortext))
                c = input(form('\nWhich options do you want to edit?\n>  ', *self.__colorinput))
                if c == '1':
                    for ranking in self.__ranking_list:
                        print(form(ranking[0], *self.__colornumber), form(ranking[1], *self.__colortext))
                    c = input(form('\nWhich options do you want to choose?\n>  ', *self.__colorinput))
                    if c == '2':
                        self.__ranking = 'frequency'
                    else:
                        self.__ranking = 'bm25f'
                elif c == '2':
                    limit = input(form('\nHow many results do you want to print?\n>  ', *self.__colorinput))
                    try:
                        self.__result_limit = int(limit)
                    except ValueError:
                        # Keep the menu alive on a non-numeric answer.
                        cprint('Invalid number, the results limit is unchanged.', 'orange', 'bold', 'url',
                               start='\t', end='\n\n')
                elif c == '3':
                    print('Fuzzyterm: ', self.__fuzzy)
                    c = input(form('\nDo you want to change it? [y/n]\n>  ', *self.__colorinput))
                    if c == 'y':
                        self.__fuzzy = not self.__fuzzy
                elif c == '4':
                    for level in self.__level_list:
                        print(form(level[0], *self.__colornumber), form(level[1], *self.__colortext))
                    c = input(form('\nWhich options do you want to choose?\n>  ', *self.__colorinput))
                    # Accept only the numbers shown in the level list.
                    if c in [x[0].replace('. ', '') for x in self.__level_list]:
                        self.__output_level = int(c)
                elif c == '5':
                    self.reset()

            # ------- Print Settings ----------------------
            elif self.__last_selected == '3':
                o_color_key = ('pink', 'bold',)
                o_color_value = ('pink', 'italic',)
                cprint('Options: ', *o_color_key, start='\n')
                print('\t{}{}'.format(form('Ranking: ', *o_color_key),
                                      form(self.__ranking, *o_color_value)))
                print('\t{}{}'.format(form('Results limit: ', *o_color_key),
                                      form(self.__result_limit, *o_color_value)))
                print('\t{}{}'.format(form('Fuzzy: ', *o_color_key),
                                      form(self.__fuzzy, *o_color_value)))
                print('\t{}{}'.format(form('Output level: ', *o_color_key),
                                      form(self.__output_level, *o_color_value)))
                print()
            # --------- Exit -------------------------
            elif self.__last_selected == '4':
                return

            else:
                cprint('Try again, you will be luckier!', 'orange', 'bold', 'url', start='\t', end='\n\n')