def __init__(self):
        """Run the built-in queries through BM25, write the run and plot it.

        Steps, in order:

        1. Read the ``run`` section of the configuration, expose it as a named
           tuple and register the ``k``/``b`` values through ``setOthers``.
        2. Parse the collection under ``../data/coll`` and dump the parser's
           ``corpusWcount`` to ``./data.json``.
        3. Rank every hard-coded query with ``XmlRank.getBm25`` using the
           weighting function named by ``config.weighting`` (looked up in the
           ``src.rank`` module) and pass each ranking to ``Run.createRun``.
        4. Compare the configured reference run with the value returned by the
           last ``createRun`` call, print the comparison table and show the
           2-D and 3-D plots of its ``pos``, ``pos2`` and ``diff`` columns.
        """
        conf = Config()

        config = conf.getConfig()
        config = namedtuple("Conf",
                            config['run'].keys())(*config['run'].values())

        conf.setOthers(['k' + str(config.k), 'b' + str(config.b)])

        parser = XmlsParser("../data/coll")
        parser.parse()
        with open("./data.json", "w") as fi:
            json.dump(parser.corpusWcount, fi)

        rank = XmlRank(parser.corpusWcount)
        # Each entry is [query id, query terms].
        query = [
            [2009011, ["olive", "oil", "health"]],
            [2009036, ["notting", "hill", "film", "actors"]],
            [2009067,
             ["probabilistic", "models", "in", "information", "retrieval"]],
            [2009073, ["web", "link", "network", "analysis"]],
            [2009074, ["web", "ranking", "scoring", "algorithm"]],
            [2009078, ["supervised", "machine", "learning", "algorithm"]],
            [2009085, ["operating", "system", "mutual", "exclusion"]],
        ]
        # Holds whatever the most recent createRun call returned; it is later
        # handed to Compare as the second file (presumably the run file path
        # -- TODO confirm against Run.createRun).
        file = ""
        func = getattr(importlib.import_module("src.rank"), config.weighting)
        for q in query:
            r = rank.getBm25(request=q[1],
                             func=func,
                             k=config.k,
                             b=config.b,
                             d=0)
            run = Run("".join(config.staff), config.step, config.num,
                      config.weighting, config.granularity, config.others)
            file = run.createRun("../runs", r, q[0])
        conf.incrementRun()

        files = ["../runs/{}".format(config.compare), file]
        compare = Compare()
        # NOTE(review): the meaning of the literals 7 and 20 is defined by
        # Compare.compare, which is not visible here.
        df = compare.compare(files[0], files[1], 7, 20)
        print(df.to_string(max_rows=1000))

        fig, axes = plt.subplots(nrows=2, ncols=2)
        df.plot(use_index=True, y=['pos2'], kind="line", ax=axes[0, 0])
        axes[0, 0].legend(['file 2'])
        df.plot(use_index=True, y=['pos2', 'pos'], kind="line", ax=axes[0, 1])
        axes[0, 1].legend(['file 2', 'file'])
        df.plot(use_index=True, y=['diff'], kind="line", ax=axes[1, 0])
        axes[1, 0].legend(['difference'])
        df.plot(use_index=True,
                y=['pos', 'pos2'],
                kind="density",
                ax=axes[1, 1])
        axes[1, 1].legend(['file 1', 'file 2'])
        plt.xticks(rotation=65)
        plt.show()

        # 3-D view: the axis labels must follow the order in which the columns
        # are passed to scatter3D (x = pos2, y = pos, z = diff).
        ax = plt.axes(projection='3d')
        xline = df['pos2'].tolist()
        yline = df['pos'].tolist()
        zline = df['diff'].tolist()
        ax.set_xlabel('file 2')
        ax.set_ylabel('file 1')
        ax.set_zlabel('diff')
        ax.scatter3D(xline, yline, zline)
        plt.show()
# Example #2
# 0
class Main:
    def __init__(self):
        """Create the configuration handle and an empty document store."""
        self.conf = Config()
        # Maps a document id to its {element path: word list} dictionary.
        self.docs = {}

    def createData(self):
        """Build the document store from every XML file of the collection.

        Globs ``../<config.data['path']>/*.xml``, converts each file with
        :meth:`elem`, merges the result into ``self.docs`` while printing a
        one-line coloured progress indicator, then dumps the store to
        ``../data/data.json``.

        An empty collection is handled: no progress line is printed and an
        empty store is written.

        Returns:
            The ``self.docs`` dictionary, updated in place.
        """
        config = self.conf.getConfig()
        config = namedtuple("Conf",
                            config['run'].keys())(*config['run'].values())
        xmls = glob.glob("../{}/*.xml".format(config.data['path']))
        total = len(xmls)
        # Count from 1 so the indicator ends on 100% (and turns green) once
        # the last file has actually been processed.
        for (done, xml) in enumerate(xmls, 1):
            self.docs.update(self.elem(xml))
            percent = done / total * 100
            print("\r{}files reading : {} {}  {} {:3.2f}%".format(
                Fore.CYAN, Fore.BLUE, xml.ljust(30),
                Fore.RED if percent < 100 else Fore.LIGHTGREEN_EX, percent),
                  end='')
        # Restore the terminal colour and terminate the progress line.
        print(Fore.RESET)
        # NOTE(review): the output path is fixed here, whereas load() reads
        # the path from config['run']['data']['file'] -- confirm they match.
        with open("../data/data.json", "w") as fi:
            json.dump(self.docs, fi)
        return self.docs

    def s(self, t, size=80):
        """Collect the word lists of the sufficiently long elements of ``t``.

        Args:
            t: Parsed XML tree (nested dictionaries) to walk.
            size: An element is kept only when it has more than ``size``
                words left after filtering.

        Returns:
            A new dictionary mapping element paths to their word lists, as
            filled in by :meth:`s_r`.
        """
        collected = {}
        english_stop_words = set(stopwords.words("english"))
        self.s_r(t, collected, english_stop_words, size=size)
        return collected

    def s_r(self, t, r, stop_words, size=80, node=None):
        """Recursively record the word lists of the elements below ``t``.

        Args:
            t: Current sub-tree; anything that is not a ``dict`` is ignored.
            r: Dictionary filled in place, mapping element paths to words.
            stop_words: Words to discard.
            size: A child is recorded (and descended into) only when it has
                more than ``size`` words left after filtering.
            node: Path of the current element; ``None`` at the root, where
                ``"/"`` is used as the prefix.

        Returns:
            ``r``, the same dictionary object that was passed in.
        """
        if not isinstance(t, dict):
            return r

        # The prefix is only ever used for children of this element, so it
        # can be resolved once up front.
        prefix = "/" if node is None else node

        for tag, children in t.items():
            for position, child in enumerate(children, start=1):
                raw_words = text_to_word_sequence(
                    clean_text(getFullText(child)))
                kept = []
                for word in raw_words:
                    if word in stop_words or word == '':
                        continue
                    if word.isalpha() and len(word) > 1:
                        kept.append(word.lower())

                if len(kept) <= size:
                    continue

                child_path = "{}/{}[{}]".format(prefix, tag, position)
                # Direct text content is stored under the element itself
                # rather than under a separate "#text" path.
                if tag == "#text":
                    r[prefix] = kept
                else:
                    r[child_path] = kept
                self.s_r(child, r, stop_words, size, node=child_path)

        return r

    def elem(self, xml):
        """Convert one XML file into a single-entry document dictionary.

        The file is read as UTF-8 and parsed with ``xmltodict`` (attributes
        dropped, every child forced into a list). :meth:`s` is then called
        with a size threshold of 30 to collect the element word lists.

        Args:
            xml: Path of the XML file to read.

        Returns:
            ``{doc_id: words_by_path}`` where ``doc_id`` is the first value
            returned by ``search(tree, "id")``.
        """
        # Only the read/parse needs the file handle; release it right after.
        with open(xml, encoding="utf-8") as fd:
            tree = xmltodict.parse(fd.read(),
                                   xml_attribs=False,
                                   force_list=True)

        ra = self.s(tree, 30)
        doc_id = search(tree, "id")[0]
        return {doc_id: ra}

    def load(self):
        """Make sure ``self.docs`` is populated.

        The store is rebuilt with :meth:`createData` when the configured data
        file (``../<run.data.file>``) is missing or when ``run.data.overwrite``
        is ``True``. If the store is still empty afterwards, it is read from
        that data file as JSON.
        """
        data_conf = self.conf.getConfig()['run']["data"]
        dataf = "../{}".format(data_conf['file'])

        if not path.exists(dataf) or data_conf['overwrite'] is True:
            self.createData()

        if self.docs:
            return
        with open(dataf) as f:
            self.docs = json.load(f)

    def __articles(self, query) -> list:
        """Rank whole articles for every query.

        Only documents that have a ``'//article[1]'`` entry take part in the
        ranking. The weighting function is looked up by name
        (``config.weighting``) in the ``src.rank`` module.

        Args:
            query: Iterable of ``[query_id, terms]`` entries.

        Returns:
            A list of ``(query_entry, ranking)`` tuples, each ranking cut to
            the first ``config.limit`` results.
        """
        run_conf = self.conf.getConfig()['run']
        config = namedtuple("Conf", run_conf.keys())(*run_conf.values())

        article_key = '//article[1]'
        corpus = {
            doc_id: parts[article_key]
            for doc_id, parts in self.docs.items()
            if article_key in parts
        }
        ranker = XmlRank(corpus)
        weighting = getattr(importlib.import_module("src.rank"),
                            config.weighting)

        results = []
        for topic in query:
            terms = clean_text(" ".join(topic[1])).split()
            ranking = ranker.getBm25(request=terms,
                                     func=weighting,
                                     k=config.k,
                                     b=config.b,
                                     d=1)
            results.append((topic, ranking[:config.limit]))
        return results

    def rank_article(self, query):
        """Rank articles for every query and write the results as a run.

        One ``Run`` is created per query and its ``createRun`` is called with
        the ``../runs`` directory, the ranking and the query id. The run
        counter of the configuration is incremented once at the end.

        Args:
            query: Iterable of ``[query_id, terms]`` entries.
        """
        run_conf = self.conf.getConfig()['run']
        config = namedtuple("Conf", run_conf.keys())(*run_conf.values())

        for topic, ranking in self.__articles(query):
            others = ['k' + str(config.k), 'b' + str(config.b)]
            run = Run("".join(config.staff), config.step, config.num,
                      config.weighting, config.granularity, others)
            run.createRun("../runs", ranking, topic[0])

        self.conf.incrementRun()

    def rank_elem(self, query):
        """Rank individual elements for every query and write them as a run.

        Every element of every document is indexed under the key
        ``doc_id + element_path``. For each query the ranking is walked in
        order and at most ``config.limit`` results are kept:

        * the first element seen for a document is always kept;
        * a further element of an already-seen document is kept only when
          that document is also the one of the most recently kept result and
          its path neither contains nor is contained in a path already kept
          for that document.

        The kept ``(doc_id, path, score)`` tuples are passed to
        ``Run.createRunElem``; the run counter is incremented once at the end.

        Args:
            query: Iterable of ``[query_id, terms]`` entries.
        """
        run_conf = self.conf.getConfig()['run']
        config = namedtuple("Conf", run_conf.keys())(*run_conf.values())

        elements = {
            doc_id + xpath: words
            for doc_id, parts in self.docs.items()
            for xpath, words in parts.items()
        }
        ranker = XmlRank(elements)
        weighting = getattr(importlib.import_module("src.rank"),
                            config.weighting)

        for topic in query:
            terms = clean_text(" ".join(topic[1])).split()
            ranking = ranker.getBm25(request=terms,
                                     func=weighting,
                                     k=config.k,
                                     b=config.b,
                                     d=0)

            kept_paths = dict()  # doc id -> paths already selected for it
            selected = []
            last_doc = 0
            for hit in ranking:
                if len(selected) == config.limit:
                    break
                # Keys were built as doc_id + path and paths start with "/",
                # so the first "/" separates the two parts.
                doc, xpath = hit[0].split("/", 1)
                known = kept_paths.get(doc)
                if known is None:
                    kept_paths[doc] = [xpath]
                elif doc == last_doc and not any(
                        xpath in other or other in xpath for other in known):
                    known.append(xpath)
                else:
                    continue
                selected.append((doc, xpath, hit[1]))
                last_doc = doc

            others = ['k' + str(config.k), 'b' + str(config.b)]
            run = Run("".join(config.staff), config.step, config.num,
                      config.weighting, config.granularity, others)
            run.createRunElem("../runs", selected, topic[0])

        self.conf.incrementRun()