def generateFeatures(self): ''' Has been hardcoded for wikipedia For each category, fetch Wiki-pages from list.txt Store keywords (links in the specified section)in features.txt ''' e = Extractor() print self.categories for name in self.categories: print name f = open("%s/%s/%s" % (self.config.get(self.section, "CLASSES_FILE"), name, self.config.get(self.section, "LIST_FILE")), "r") g = open("%s/%s/%s" % (self.config.get(self.section, "CLASSES_FILE"), name, self.config.get(self.section, "FEATURE_FILE")), "w") for page in f: print page pagetok = page.strip().split('\t') try: section = pagetok[1] except: section = 0 links = e.getWikiLinks(pagetok[0], section = section) for feature in links: units = set(self.clean(feature).split('_')) for unit in units: unit = self.stemmer.stem(unit) if self.valid(unit): g.write("%s," % unit) g.write("\n") f.close() g.close()
def spider(self, root, pages = True, subcategories = True, action = "traverse", preclean = False, depth = 1): if preclean: self.graphdb.clear() seen_key = "URL_SEEN" queue_key = "URL_QUEUE" ex = Extractor() batch = neo4j.WriteBatch(self.graphdb) queue_empty = lambda: self.fdb.scard(queue_key) == 0 seen = lambda x: self.fdb.sismember(seen_key, x) visit = lambda x: self.fdb.sadd(seen_key, x) dequeue = lambda: self.fdb.spop(queue_key) enqueue = lambda x: self.fdb.sadd(queue_key, self._encode_str(x)) if action == "traverse": enqueue(root) while not queue_empty(): current = dequeue() print current if current and current.strip() and not seen(current): visit(current) result = ex.getAllFromCategory(current) self.updateBatch(batch, type = neo4j.Node, node = {'name': current, 'class': self.CATEGORY}) if pages: for page in result['pages']: print "{0}\tp:{1}".format(current[:15], page) self.incr_rel(page, current, self.CATEGORY_REL) self.updateBatch(batch, type = neo4j.Node, node = {'name': page, 'class': self.ARTICLE}) links = ex.getWikiLinks(page) for a in links: print "{0}\tp:{1}\t{2}".format(current[:15], page, a) self.incr_rel(a, page, self.SIBLING_REL) self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.ARTICLE}) if subcategories: for subcat in result['categories']: print "{0}\tc:{1}".format(current, subcat) self.incr_rel(subcat, current, self.SUBCAT_REL) self.updateBatch(batch, type = neo4j.Node, node = {'name': subcat, 'class': self.CATEGORY}) enqueue(subcat) elif action == "crawl": enqueue(root) while not queue_empty(): topic = dequeue() if topic and topic.strip() and not seen(topic): visit(topic) result = ex.extract(topic) depth -= 1 self.updateBatch(batch, type = neo4j.Node, node = {'name': topic, 'class': result['type']}) if result['type'] == self.CATEGORY: pass elif result['type'] == self.ARTICLE: for a in result['links']: self.incr_rel(a, topic, self.SIBLING_REL) print "adding: ", a self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.ARTICLE}) if depth > 0: enqueue(a) for c in result['categories']: self.incr_rel(a, topic, self.CATEGORY_REL) self.updateBatch(batch, type = neo4j.Node, node = {'name': c, 'class': self.CATEGORY}) elif result['type'] == self.DISAMBIGUATION: for a in result['links']: self.incr_rel(a, topic, self.DISAMB_REL) self.updateBatch(batch, type = neo4j.Node, node = {'name': a, 'class': self.DISAMBIGUATION}) print "FINISHED WITH THE NODES..." for k in self.fdb.smembers(self.rel_key): print "REL:", k try: nodes = k.split(":", 2) rel = nodes[0] n1 = self.node_index.get('name', nodes[1])[0] n2 = self.node_index.get('name', nodes[2])[0] self.updateBatch(batch, type = neo4j.Relationship, rel = {'node1': n1, 'rel': rel, 'weight': 1, 'node2': n2}) except Exception as e: print "REL EXCEPTION: ", e print "DONE>>>>>>>>>>>>>>>"