def create(d):
    """Build one Wikipedia/DBpedia document from a dumped record and index it."""
    try:
        d = json.loads(d)
        if d["_id"] in done:
            print "Ignore"
            return
        from srmse import db
        db = db.getMongo()
        db = db["cron-dbpedia"]

        dic = {}
        # Rewrite the escaped DBpedia resource URI into a Wikipedia URL.
        dic["_id"] = d["_id"].replace(
            "http://dbpedia#dot#org/resource/",
            "http://www.wikipedia.org/wiki/").replace("#dot#", ".")
        dic["body"] = d["summary"]
        dic["search"] = d["search"]
        dic["box"] = fetchInfoBox(d["_id"], db)
        dic["external_links"] = fetchExternalLinks(d["_id"], db)
        dic["page_views"] = fetchPageViews(d["_id"], db)
        dic["home_page"] = fetchHomePage(d["_id"], db)
        dic["image"] = fetchImage(d["_id"], db)

        # Categories: (found, categories, categories_search_fields)
        k = getCategories(d["_id"], db)
        if k[0]:
            dic["categories"] = k[1]
            dic["categories_search_fields"] = k[2]
        else:
            dic["categories"] = []
            dic["categories_search_fields"] = {}

        # Disambiguation: (isDisambiguation, resolvedTo)
        dd = isDisambiguate(d["_id"], db)
        dic["isDisambiguation"] = dd[0]
        dic["resolvedTo"] = dd[1]
        if dd[0]:
            dic["bestResolved"] = getBestResolved(d["_id"], dd[1], db)
        else:
            dic["bestResolved"] = None

        # Redirects: (hasRedirects, redirects, redirects_search_fields)
        ff = getRedirects(d["_id"], db)
        if ff[0]:
            dic["hasRedirects"] = True
            dic["redirects"] = ff[1]
            dic["redirects_search_fields"] = ff[2]
        else:
            dic["hasRedirects"] = False

        index(dic, d["_id"], db)
    except Exception as e:
        print e
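# Example of the _id rewrite above (a sketch; the URI below is illustrative,
# and the dump's "#dot#" escaping of "." is undone by the second replace()):
#   "http://dbpedia#dot#org/resource/Python_(programming_language)"
#       -> "http://www.wikipedia.org/wiki/Python_(programming_language)"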
import inject, sys, json
import gzip
import requests
import config
config = config.getConfig()
import threading
from srmse import nutch
from srmse import db

db = db.getMongo(False)
db = db["common-crawl"]

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


def getDict(url, url_data):
    """
    Parse the metadata dict that follows the URL in a common-crawl index line.

    dump='/common-crawl/parse-output/segment/{arcSourceSegmentId}/{arcFileDate}_{arcFilePartition}.arc.gz'
    """
    # Strip the leading URL, keep the trailing "{...}" part, and evaluate it.
    dic = url_data.replace(url_data.split(" {")[0], "")
    js = eval(dic.replace(" ", "").replace("\n", ""))
    return js


def fetchDomain(domain):
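# A minimal usage sketch for getDict(), assuming an index line of the form
# "<url> {<metadata dict>}" as produced by the common-crawl URL index; the
# sample values below are illustrative, not taken from a real dump.
def _demo_getDict():
    line = 'http://example.com/page {"arcSourceSegmentId": 1346823845675, "arcFileDate": 1346870160765, "arcFilePartition": 1154}'
    meta = getDict("http://example.com/page", line)
    # meta is a plain dict, e.g. meta["arcFilePartition"] == 1154
    return meta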
from multiprocessing import Pool
import json
from srmse import db
es = db.getES()
import threading
import string
import re, os, sys

files = os.listdir(".")

# Collect the ids of articles that were already indexed in a previous run.
done = []
db = db.getMongo()
db = db["cron-dbpedia"]
c = db.wiki_done.find({"done": {"$exists": True}})
for cc in c:
    done.append(cc["done"])
print "previous done ", c.count()

if "result.json" not in files:
    # dump the collection to get seeds
    f = db["short-abstracts_en"].find({})
    for ff in f:
        open("result.json", "a").write(json.dumps(ff) + "#SPLIT#\n#SPLIT#")
    print "Dumped article list"


def index(d, i, db):
    """Index a prepared document into Elasticsearch and record it as done."""
    idd = d["_id"]
    del d["_id"]
    try:
        es.index(index="wiki_module1", id=idd, doc_type="doc", body=d)
        print "Indexed"
        db.wiki_done.insert({"done": i}, w=0)
    except:
        # Put the _id back if indexing failed.
        d["_id"] = idd
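# A minimal driver sketch; the original run loop is not in this dump, so the
# wiring below is an assumption. It reuses the "#SPLIT#" record delimiter and
# result.json written above, Pool, and the create() function of this job; the
# worker count is arbitrary.
def run():
    records = open("result.json", "r").read().split("#SPLIT#\n#SPLIT#")
    records = [r for r in records if r.strip()]
    pool = Pool(8)  # assumed worker count
    pool.map(create, records)
    pool.close()
    pool.join()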
import urllib2 as u
from bs4 import BeautifulSoup as bs
from srmse import db
import json

mongo = db.getMongo()
dbpedia = mongo["cron-dbpedia"]


def dictify(ul):
    """Turn a nested <ul> of class names into a nested dict (leaves are None)."""
    result = {}
    for li in ul.find_all("li", recursive=False):
        key = next(li.stripped_strings)
        ul = li.find("ul")
        if ul:
            result[key] = dictify(ul)
        else:
            result[key] = None
    return result

# html = u.urlopen("http://mappings.dbpedia.org/server/ontology/classes/").read()
# soup = bs(html)
# ul = soup.body.ul
# from pprint import pprint
# pprint(dictify(ul), width=1)

# Load the saved class tree from db.json and store it in MongoDB.
dic = eval(open("db.json", "r").read())
dbpedia.classes.insert(dic)
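# A minimal sketch of what dictify() returns, run on an inline snippet shaped
# like the nested class list at mappings.dbpedia.org (the class names here are
# only illustrative):
def _demo_dictify():
    html = ("<ul><li>Thing<ul><li>Agent<ul><li>Person</li></ul></li>"
            "<li>Place</li></ul></li></ul>")
    soup = bs(html, "html.parser")
    # -> {"Thing": {"Agent": {"Person": None}, "Place": None}}
    return dictify(soup.ul)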