def build_index(self):
    """Build the inverted index in the database.

    For every ``(url, name)`` pair yielded by ``Corpus().get_file_name()``:

    1. the URL is inserted into ``web.doc`` and its ``id`` is read back
       with a ``SELECT`` on the same URL;
    2. one ``web.web_index`` row ``(token, doc_id, tf, wt)`` is inserted
       for each token returned by ``Tokenizer().tokenize(name)``;
    3. every token is inserted into ``tokenT`` with ``INSERT IGNORE``.

    Pairs whose URL is longer than 1000 characters, or whose tokenizer
    result is empty, are skipped. Progress is reported with ``print``.
    """
    corpus = Corpus()
    tokenizer = Tokenizer()

    # The statements never change, so build them once outside the loop.
    doc_insert_sql = "INSERT INTO web.doc(url) values (%s)"
    doc_select_sql = "select id from doc where url=%s"
    index_insert_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"
    token_insert_sql = "Insert ignore into tokenT values (%s)"

    for url, name in corpus.get_file_name():
        if len(url) > 1000:
            continue
        # NOTE(review): result is used as a mapping token -> sequence whose
        # first two items go to the tf and wt columns; confirm with Tokenizer.
        result = tokenizer.tokenize(name)
        if not result:
            continue
        print(url)

        # Insert the URL into the doc table.
        url_params = (url, )
        self.mycursor.execute(doc_insert_sql, url_params)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in URL.")
        print(url)

        # Read back the id stored for this URL.
        self.mycursor.execute(doc_select_sql, url_params)
        doc_row = self.mycursor.fetchone()
        doc_id = doc_row[0]
        print("DOC_ID IS " + str(doc_id))

        # Insert (token, doc_id, tf, wt) rows into web_index in one batch.
        index_rows = [(token, doc_id, stats[0], stats[1])
                      for token, stats in result.items()]
        self.mycursor.executemany(index_insert_sql, index_rows)
        self.mydb.commit()
        print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")

        # Insert the tokens into tokenT as a single batch with one commit,
        # instead of one statement and one commit per token.
        token_rows = [(token, ) for token in result]
        self.mycursor.executemany(token_insert_sql, token_rows)
        self.mydb.commit()
        print("inserted " + str(len(token_rows)) + " Tokens")
class Graph:
    """Directed link graph of corpus pages with a simple PageRank scorer.

    Vertices are stored in ``self.Vertices`` and identified by URL
    (``vertex.id``). Edges are built from the ``href`` links found in each
    corpus file. ``self.score`` maps a URL to its current rank and
    defaults to 1 before the first call to :meth:`page_rank`.
    """

    def __init__(self):
        self.corpus = Corpus()
        self.Vertices = []
        self.total_vertex = 0
        # NOTE(review): total_edge is initialised here but never updated by
        # any method of this class.
        self.total_edge = 0
        self.score = defaultdict(lambda: 1)
        # vertex id -> position of the first vertex with that id in
        # self.Vertices; lets find() avoid a linear scan.
        self._positions = {}
        # Number of leading entries of self.Vertices already reflected in
        # self._positions.
        self._indexed = 0

    def load_vertex(self):
        """Add a vertex for every corpus page and an edge for every link.

        Each file yielded by ``self.corpus.get_file_name()`` is parsed with
        ``html.fromstring`` (presumably ``lxml.html`` -- confirm against the
        file's imports); every ``<a href>`` is resolved against the page URL
        and added as an edge from the page to the link target.
        """
        for url, adr in self.corpus.get_file_name():
            if self.find(url) is None:
                self.add_vertex(url)
            with open(adr, "rb") as file:
                content = file.read()
            htmlElem = html.fromstring(content)
            for link in htmlElem.xpath('//a/@href'):
                self.add_edge(Edge(url, urljoin(url, link)))

    def print_score(self):
        """Print the URL and score of at most the first 100 scored pages."""
        for shown, url in enumerate(self.score):
            if shown == 100:
                break
            print("url: ", url, "\n", "score:", self.score[url])

    def add_vertex(self, url):
        """Append a new ``Vertex`` for ``url`` and report the running total."""
        self.Vertices.append(Vertex(url))
        self.total_vertex += 1
        print("# of Vertex added: ", self.total_vertex, "url: ", url)

    def add_edge(self, edge):
        """Record ``edge`` on both of its endpoints.

        Missing endpoints are created first. The edge is added as an
        in-edge of ``edge.dst`` and as an out-edge of ``edge.src`` unless
        the vertex reports (via ``contain`` / ``contain_out``) that it
        already holds it.
        """
        dst_pos = self.find(edge.dst)
        if dst_pos is None:
            self.add_vertex(edge.dst)
            dst_pos = self.find(edge.dst)
        dst_vertex = self.Vertices[dst_pos]
        if not dst_vertex.contain(edge):
            dst_vertex.add_in_edge(edge)

        src_pos = self.find(edge.src)
        if src_pos is None:
            self.add_vertex(edge.src)
            src_pos = self.find(edge.src)
        src_vertex = self.Vertices[src_pos]
        if not src_vertex.contain_out(edge):
            src_vertex.add_out_edge(edge)

    def _refresh_positions(self):
        """Bring the id -> position index up to date with ``self.Vertices``."""
        known = len(self.Vertices)
        if self._indexed == known:
            return
        if self._indexed > known:
            # The list shrank behind our back; rebuild from scratch.
            self._positions = {}
            self._indexed = 0
        for pos in range(self._indexed, known):
            # setdefault keeps the first occurrence, matching a
            # front-to-back linear search.
            self._positions.setdefault(self.Vertices[pos].id, pos)
        self._indexed = known

    def find(self, id):
        """Return the position in ``self.Vertices`` of the vertex with ``id``.

        Returns ``None`` when no vertex has that id. Lookups use a dict
        index that is extended lazily, so vertices appended directly to
        ``self.Vertices`` are still found.
        """
        self._refresh_positions()
        return self._positions.get(id)

    def page_rank(self, d=0.85, max_iteration=2):
        """Run ``max_iteration`` rounds of PageRank and store the result.

        In each round every vertex ``v`` receives
        ``(1 - d) + d * sum(score[u] / out_degree(u))`` over the sources
        ``u`` of its in-edges, using the scores of the previous round.
        Each new score is printed, and ``self.score`` is replaced at the
        end of every round.

        Args:
            d: damping factor.
            max_iteration: number of rounds to run.
        """
        for _ in range(max_iteration):
            new_score = defaultdict(int)
            for i in range(self.total_vertex):
                vertex = self.Vertices[i]
                rank = 1 - d
                for x in range(vertex.num_in_edges):
                    src = vertex.in_edges[x].src
                    src_vertex = self.Vertices[self.find(src)]
                    rank += d * (self.score[src] / src_vertex.num_out_edges)
                new_score[vertex.id] = rank
                print("url: ", vertex.id, "\n", "score:", rank)
            self.score = new_score