예제 #1
0
    def build_index(self):
        """
        Build the inverted index for every document in the corpus.

        For each ``(url, name)`` pair yielded by ``Corpus.get_file_name()``:

        1. tokenize the file with ``Tokenizer.tokenize(name)``;
        2. insert the URL into ``web.doc`` and read back its generated id;
        3. insert one ``(token, doc_id, tf, wt)`` row per token into
           ``web.web_index``;
        4. register every token in ``tokenT`` (``INSERT IGNORE``, so tokens
           that are already present are skipped by the database).

        Documents whose URL is longer than 1000 characters, or whose
        tokenization result is empty, are skipped.  Progress is reported on
        stdout.  Uses ``self.mycursor`` / ``self.mydb`` for all database work.
        """

        c = Corpus()
        t = Tokenizer()

        for url, name in c.get_file_name():
            # NOTE(review): the 1000-character cut-off presumably mirrors the
            # width of the url column -- confirm against the schema.
            if len(url) > 1000:
                continue
            result = t.tokenize(name)
            if not result:
                continue
            print(url)

            # Insert URL into table DOC
            sql = "INSERT INTO web.doc(url) values (%s)"
            val = (url, )
            self.mycursor.execute(sql, val)
            self.mydb.commit()

            print(self.mycursor.rowcount, "was inserted in URL.")

            # Read back the id the database assigned to this URL.
            print(url)
            s_sql = "select id from doc where url=%s"
            self.mycursor.execute(s_sql, val)
            myresult = self.mycursor.fetchone()
            doc_id = myresult[0]
            print("DOC_ID IS " + str(doc_id))

            # Insert token, doc_id, tf, wt into web_index.  Each tokenizer
            # entry is indexed as [0] -> tf column and [1] -> wt column.
            t_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"
            t_val = [(token, doc_id, stats[0], stats[1])
                     for token, stats in result.items()]

            self.mycursor.executemany(t_sql, t_val)

            self.mydb.commit()
            print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")

            # Insert into tokenT table: one batched statement and a single
            # commit instead of a round trip and a commit per token.
            tq = "Insert ignore into tokenT values (%s)"
            token_rows = [(token, ) for token in result]
            self.mycursor.executemany(tq, token_rows)
            self.mydb.commit()

            print("inserted " + str(len(token_rows)) + " Tokens")
예제 #2
0
class Graph:
    """
    Directed link graph over the corpus pages, with a simple PageRank.

    Vertices are stored in the list ``Vertices`` (one ``Vertex`` per URL) and
    are looked up by their ``id``.  ``score`` maps a vertex id to its current
    rank; before ``page_rank`` has run, every id reads as 1.
    """

    def __init__(self):
        self.corpus = Corpus()
        self.Vertices = []
        self.total_vertex = 0
        # NOTE(review): no method visible in this class updates total_edge;
        # confirm whether add_edge is expected to maintain it.
        self.total_edge = 0
        self.score = defaultdict(lambda: 1)

    def load_vertex(self):
        """
        Populate the graph from the corpus.

        Every ``(url, adr)`` pair from ``self.corpus.get_file_name()`` becomes
        a vertex; the file at ``adr`` is parsed and each ``<a href>`` found in
        it, resolved against ``url``, becomes an edge ``url -> link``.
        """
        for url, adr in self.corpus.get_file_name():
            # Register the page even when it turns out to have no links.
            self._ensure_vertex(url)
            with open(adr, "rb") as file:
                content = file.read()
            htmlElem = html.fromstring(content)
            links = htmlElem.xpath('//a/@href')
            # Resolve every link first, then add the edges.
            outputLinks = [urljoin(url, link) for link in links]
            for link in outputLinks:
                self.add_edge(Edge(url, link))

    def print_score(self):
        """Print the first (at most) 100 entries of ``self.score``."""
        for count, url in enumerate(self.score):
            if count == 100:
                break
            print("url: ", url, "\n", "score:", self.score[url])

    def add_vertex(self, url):
        """Append a new vertex for *url* and report the running total."""
        self.Vertices.append(Vertex(url))
        self.total_vertex += 1
        print("# of Vertex added: ", self.total_vertex, "url: ", url)

    def _ensure_vertex(self, url):
        """Return the index of the vertex for *url*, adding it if absent."""
        index = self.find(url)
        if index is None:
            self.add_vertex(url)
            index = self.find(url)
        return index

    def add_edge(self, edge):
        """
        Record *edge* on both of its endpoints.

        Missing endpoint vertices are created (destination first, then
        source).  The edge is added to the destination's in-edges and to the
        source's out-edges unless the vertex reports that it already
        contains it.
        """
        dst_index = self._ensure_vertex(edge.dst)
        if not self.Vertices[dst_index].contain(edge):
            self.Vertices[dst_index].add_in_edge(edge)
        src_index = self._ensure_vertex(edge.src)
        if not self.Vertices[src_index].contain_out(edge):
            self.Vertices[src_index].add_out_edge(edge)

    def find(self, id):
        """
        Return the index in ``self.Vertices`` of the first vertex whose
        ``id`` equals *id*, or ``None`` when there is no such vertex.
        """
        for index, vertex in enumerate(self.Vertices):
            if vertex.id == id:
                return index
        return None

    def page_rank(self, d=0.85, max_iteration=2):
        """
        Run *max_iteration* rounds of PageRank and store the result in
        ``self.score``.

        Each round computes, for every vertex ``v``::

            new_score[v] = (1 - d) + d * sum(score[u] / out_degree(u))

        over the sources ``u`` of ``v``'s in-edges, using the scores of the
        previous round.  After each round the score of the last vertex is
        printed.  An empty graph is handled: nothing is printed and
        ``self.score`` ends up empty.

        d             -- damping factor
        max_iteration -- number of rounds to run
        """
        vertices = self.Vertices[:self.total_vertex]

        # Out-degree per vertex id, built once so each in-edge costs a dict
        # lookup rather than a linear find().  setdefault keeps the first
        # vertex carrying a given id, matching what find() would return.
        out_degree = {}
        for vertex in self.Vertices:
            out_degree.setdefault(vertex.id, vertex.num_out_edges)

        for _ in range(max_iteration):
            new_score = defaultdict(int)
            for vertex in vertices:
                rank = 1 - d
                for x in range(vertex.num_in_edges):
                    src = vertex.in_edges[x].src
                    rank += d * (self.score[src] / out_degree[src])
                new_score[vertex.id] = rank
            # Report the last vertex processed in this round; skipped for an
            # empty graph, where there is no vertex to report.
            if vertices:
                last = vertices[-1]
                print("url: ", last.id, "\n", "score:", new_score[last.id])
            self.score = new_score