Example #1
def create(d):
    try:
        d = json.loads(d)
        # Skip records that were already indexed in a previous run.
        if d["_id"] in done:
            print "Ignore"
            return
        from srmse import db
        db = db.getMongo()
        db = db["cron-dbpedia"]
        # Build the document to index, rewriting the DBpedia resource URI
        # into the corresponding Wikipedia URL.
        dic = {}
        dic["_id"] = d["_id"].replace(
            "http://dbpedia#dot#org/resource/",
            "http://www.wikipedia.org/wiki/").replace("#dot#", ".")
        dic["body"] = d["summary"]
        dic["search"] = d["search"]
        # Enrich the document with data pulled from the other DBpedia collections.
        dic["box"] = fetchInfoBox(d["_id"], db)
        dic["external_links"] = fetchExternalLinks(d["_id"], db)
        dic["page_views"] = fetchPageViews(d["_id"], db)
        dic["home_page"] = fetchHomePage(d["_id"], db)
        dic["image"] = fetchImage(d["_id"], db)
        k = getCategories(d["_id"], db)
        if k[0]:
            dic["categories"] = k[1]
            dic["categories_search_fields"] = k[2]
        else:
            dic["categories"] = []
            dic["categories_search_fields"] = {}

        # If the page is a disambiguation page, also record the best-resolved target.
        dd = isDisambiguate(d["_id"], db)
        dic["isDisambiguation"] = dd[0]
        dic["resolvedTo"] = dd[1]
        if dd[0]:
            dic["bestResolved"] = getBestResolved(d["_id"], dd[1], db)
        else:
            dic["bestResolved"] = None
        # Attach redirect pages (and their search fields) if any exist.
        ff = getRedirects(d["_id"], db)
        if ff[0]:
            dic["hasRedirects"] = True
            dic["redirects"] = ff[1]
            dic["redirects_search_fields"] = ff[2]
        else:
            dic["hasRedirects"] = False
        index(dic, d["_id"], db)
    except Exception as e:
        print e
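
For context, create() receives one raw JSON string per article. Below is a minimal driver sketch, assuming the result.json dump and its "#SPLIT#" separators produced by the module in Example #3 further down; the worker count is an arbitrary choice for illustration.

from multiprocessing import Pool

if __name__ == "__main__":
    raw = open("result.json").read()
    # Records were written as json.dumps(record) + "#SPLIT#\n#SPLIT#".
    records = [r.strip() for r in raw.split("#SPLIT#") if r.strip()]
    pool = Pool(4)  # arbitrary worker count
    pool.map(create, records)
    pool.close()
    pool.join()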
Example #2
import inject, sys, json
import gzip
import requests
import config
config = config.getConfig()
import threading
from srmse import nutch
from srmse import db
db = db.getMongo(False)

db = db["common-crawl"]
# Prefer the faster C implementation of StringIO; fall back to the pure-Python one.
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i+n]


def getDict(url, url_data):
    """
    dump='/common-crawl/parse-output/segment/{arcSourceSegmentId}/{arcFileDate}_{arcFilePartition}.arc.gz'
    """
    # url_data is the URL followed by a dict-like blob; strip the URL prefix
    # and evaluate the remaining dict literal.
    dic = url_data.replace(url_data.split(" {")[0], "")
    js = eval(dic.replace(" ", "").replace("\n", ""))
    return js
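
# Minimal usage sketches for chunks() and getDict() (illustrative only;
# the names and values below are hypothetical):
#
#     for batch in chunks(seed_urls, 100):        # batch size is arbitrary
#         process(batch)                          # hypothetical per-batch handler
#
#     line = 'http://example.org/ {"arcSourceSegmentId":1346823845675,"arcFileDate":"2012-08-02","arcFilePartition":12}'
#     meta = getDict("http://example.org/", line) # -> dict with the arc metadata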


def fetchDomain(domain):
Example #3
from multiprocessing import Pool
import json
from srmse import db
es = db.getES()
import threading
import string
import re, os, sys

files = os.listdir(".")
done = []
db = db.getMongo()
db = db["cron-dbpedia"]

# Collect the ids that were already indexed so they can be skipped later.
c = db.wiki_done.find({"done": {"$exists": True}})
for cc in c:
    done.append(cc["done"])
print "previous done ", c.count()

if not "result.json" in files:
    # Dump the collection to get seeds; records are separated by "#SPLIT#" markers.
    f = db["short-abstracts_en"].find({})
    for ff in f:
        open("result.json", "a").write(json.dumps(ff) + "#SPLIT#\n#SPLIT#")
    print "Dumped article list"


def index(d, i, db):
    # Pass the Mongo "_id" separately as the Elasticsearch document id;
    # restore it on the dict if indexing fails.
    idd = d["_id"]
    del d["_id"]
    try:
        es.index(index="wiki_module1", id=idd, doc_type="doc", body=d)
        print "Indexed"
        db.wiki_done.insert({"done": i}, w=0)
    except:
        d["_id"] = idd
Example #4
import urllib2 as u
from bs4 import BeautifulSoup as bs
from srmse import db
import json
import ast

mongo = db.getMongo()
dbpedia = mongo["cron-dbpedia"]


def dictify(ul):
    """Turn a nested <ul> tree into nested dicts; leaf items map to None."""
    result = {}
    for li in ul.find_all("li", recursive=False):
        key = next(li.stripped_strings)
        ul = li.find("ul")
        if ul:
            result[key] = dictify(ul)
        else:
            result[key] = None
    return result


# db.json was presumably generated from the DBpedia ontology class tree
# with the commented-out steps below:
#html=u.urlopen("http://mappings.dbpedia.org/server/ontology/classes/").read()
#soup=bs(html)
#ul = soup.body.ul
#from pprint import pprint
#pprint(dictify(ul), width=1)

# db.json holds a Python dict literal (pprint output), not JSON, so parse it
# with ast.literal_eval rather than json.load before inserting it into Mongo.
dic = ast.literal_eval(open("db.json", "r").read())
dbpedia.classes.insert(dic)
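
For reference, a minimal sketch of what dictify() produces, reusing the bs import and dictify() from this example and run against an illustrative HTML fragment rather than the real DBpedia ontology page:

html = """
<ul>
  <li>Agent
    <ul>
      <li>Person</li>
      <li>Organisation</li>
    </ul>
  </li>
  <li>Place</li>
</ul>
"""
soup = bs(html, "html.parser")
print dictify(soup.ul)
# e.g. {u'Agent': {u'Person': None, u'Organisation': None}, u'Place': None}
# (key order may differ)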