def __init__(self, files, silent=False):
    """Store the input/output handles and load the ID lookup table.

    files  -- sequence; files[0] is kept as self.f0, files[1] as self.f1.
    silent -- flag stored unchanged on the instance as self.silent.

    Opens "data_id_post.csv" (relative to the current working directory),
    hands it to Map_ID_Loader and keeps both the loader and the dict it
    returns from getDict().
    """
    self.f0 = files[0]
    self.f1 = files[1]
    self.silent = silent
    # The with-block closes the CSV even when Map_ID_Loader or getDict()
    # raises; a bare open()/close() pair would leak the handle in that case.
    with open("data_id_post.csv", "r") as f:
        self.id_loader = Map_ID_Loader(f, True)
        self.id_dict = self.id_loader.getDict()
class CityGraph_header(object):
    """Write a per-document report of city mentions found in header docs.

    The input (files[0]) is read in full and split into <doc>...</doc>
    chunks. For each chunk the first <NE:CITY> tag inside <cname> is taken
    as the client city and every <NE:CITY> tag inside <head> is counted.
    The report is written to files[1].
    """

    def __init__(self, files, silent=False):
        """Store the file handles and load the ID lookup table.

        files  -- sequence; files[0] is the input, files[1] the output.
        silent -- flag stored on the instance (not read by this class).
        """
        self.f0 = files[0]
        self.f1 = files[1]
        self.silent = silent
        # The with-block closes the CSV even if the loader raises.
        with open("data_id_post.csv", "r") as f:
            self.id_loader = Map_ID_Loader(f, True)
            self.id_dict = self.id_loader.getDict()

    @staticmethod
    def _first_match(pattern, text):
        """Return the first full match of pattern in text, or "" if none."""
        found = pattern.search(text)
        if found:
            return found.group(0)
        return ""

    @staticmethod
    def _city_name(tag):
        """Return the text between the opening and closing tag of `tag`."""
        start = tag.find('>') + 1
        # Second '<' in the string, i.e. the start of the closing tag.
        end = tag.find('<', tag.find('<') + 1)
        return tag[start:end]

    def process(self):
        """Read every doc from self.f0 and write one report entry per doc."""
        doc_pattern = re.compile(r'<doc>.*?</doc>', re.S)
        cname_pattern = re.compile(r'<cname>.*?</cname>', re.S)
        head_pattern = re.compile(r'<head>.*?</head>', re.S)
        city_pattern = re.compile(r'<NE:CITY.*?>.*?</NE:CITY>', re.S)

        docs = doc_pattern.findall(self.f0.read())
        for index, doc in enumerate(docs, 1):
            # A doc without <cname> or <head> is treated as having an empty
            # section instead of aborting the whole run with IndexError.
            cname_text = self._first_match(cname_pattern, doc)
            head_text = self._first_match(head_pattern, doc)

            client = ""
            cname_cities = city_pattern.findall(cname_text)
            if cname_cities:
                client = self._city_name(cname_cities[0])

            citycount = {}
            for tag in city_pattern.findall(head_text):
                name = self._city_name(tag)
                citycount[name] = citycount.get(name, 0) + 1

            # No city in <cname>: fall back to the most frequent <head> city.
            if not client and citycount:
                client = self.find_most(citycount)
            self.output(client, citycount, index)

    def output(self, client, citycount, ith):
        """Write the report entry for document number `ith` to self.f1.

        client    -- client city name; an empty value is reported as "N/A".
        citycount -- dict mapping city name to its count.
        ith       -- 1-based position of the document in the input.
        """
        self.f1.write("--------------------------------------\n")
        self.f1.write("header doc %5d\n" % (ith))
        if not client:
            client = "N/A"
        self.f1.write("client city is %20s\n" % (client))
        for city in citycount:
            self.f1.write("%20s %5s\n" % (city, citycount[city]))

    def find_most(self, citycount):
        """Return the key with the largest positive count, or '' if none.

        On a tie the key met first while iterating the dict wins, because
        only a strictly larger count replaces the current best.
        """
        best_count = 0
        best_city = ''
        for city in citycount:
            if citycount[city] > best_count:
                best_count = citycount[city]
                best_city = city
        return best_city
class Header_Matcher(object):
    """Copy raw header docs whose ID also appears in an annotated file.

    files[0] is the raw header input, files[1] the annotated input and
    files[2] the output. Docs are matched through the ID that
    self.id_dict assigns to the URL on the first line after <doc>.
    """

    def __init__(self, files, silent=False):
        """Store the three file handles and load the ID lookup table.

        files  -- sequence of raw input, annotated input, output.
        silent -- flag stored on the instance (not read by this class).
        """
        self.f1 = files[0]
        self.f2 = files[1]
        self.f3 = files[2]
        self.silent = silent
        # The with-block closes the CSV even if the loader raises.
        with open("data_id_post.csv", "r") as f:
            self.id_loader = Map_ID_Loader(f, True)
            self.id_dict = self.id_loader.getDict()

    def process(self):
        """Build the annotated-ID set, copy matching raw docs, print counts."""
        self.make_annotated_dict()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(len(self.annotated_dict))
        self.go_through_header_raw()
        print(self.num)

    def go_through_header_raw(self):
        """Write every raw doc with an annotated ID to self.f3.

        Sets self.num to the number of docs written.
        """
        self.num = 0
        body, fid = self.extractDoc(self.f1)
        while body != "":
            if fid in self.annotated_dict:
                self.f3.write(body)
                self.num += 1
            body, fid = self.extractDoc(self.f1)

    def make_annotated_dict(self):
        """Collect the IDs of all docs in self.f2 into self.annotated_dict."""
        self.annotated_dict = {}
        body, fid = self.extractDoc(self.f2)
        while body != "":
            # "" means the URL had no entry in id_dict. Recording it would
            # make every raw doc with an unmapped URL count as annotated.
            if fid != "":
                self.annotated_dict[fid] = 1
            body, fid = self.extractDoc(self.f2)

    def extractDoc(self, f):
        """Read the next doc from f and return (body, fid).

        body -- the lines between the <doc> line and the </doc> line,
                concatenated; "" when no further <doc> is found.
        fid  -- self.id_dict entry for the URL on the first body line,
                or "" when the URL is not in the table.

        The URL line is expected to look like "<http:url>: ..."; the angle
        brackets and the trailing ':' of its first token are stripped
        before the lookup. Callers treat an empty body as end of input.
        """
        line = f.readline()
        while line and "<doc>" not in line:
            line = f.readline()
        if not line:
            return "", ""

        line = f.readline()
        fid = ""
        tokens = line.replace('<', '').replace('>', '').split()
        # A blank or missing header line has no tokens; leave fid empty
        # rather than raising IndexError.
        if tokens:
            key = tokens[0][:-1]
            if key in self.id_dict:
                fid = self.id_dict[key]

        collected = []
        # readline() returns '' at end of file, so `line` must be tested
        # too or a doc without </doc> would spin here forever.
        while line and "</doc>" not in line:
            collected.append(line)
            line = f.readline()
        return "".join(collected), fid
class CityGraph(object):
    """Count city mentions per document and write them as a table.

    files[0] is the input holding <doc>...</doc> sections, files[1] the
    output. self.doc_dict maps a document ID to a dict of
    {unified city name: count}.
    """

    def __init__(self, files, silent=False):
        """Store the file handles and load the ID lookup table.

        files  -- sequence; files[0] is the input, files[1] the output.
        silent -- when true, the out_* methods write nothing.
        """
        self.f0 = files[0]
        self.f1 = files[1]
        self.silent = silent
        # The with-block closes the CSV even if the loader raises.
        with open("data_id_post.csv", "r") as f:
            self.id_loader = Map_ID_Loader(f, True)
            self.id_dict = self.id_loader.getDict()

    def init(self):
        """Build the per-document city counts from the whole input."""
        self.docNum = 0
        self.doc_dict = {}
        body, fid = self.extractDoc()
        while body != "":
            # Docs whose URL has no entry in id_dict come back with fid ""
            # and are skipped.
            if fid != "":
                # NOTE(review): a repeated ID starts from an empty dict
                # again, discarding the earlier doc's counts -- confirm
                # that IDs are unique in the input.
                self.doc_dict[fid] = {}
                self.update_dict(body, fid)
            body, fid = self.extractDoc()

    def update_dict(self, docstr, fid):
        """Add every <NE:CITY> mention in docstr to self.doc_dict[fid].

        self.doc_dict[fid] must already exist.
        """
        city_pattern = re.compile(r'<NE:CITY.*?>.*?</NE:CITY>')
        counts = self.doc_dict[fid]
        for tag in city_pattern.findall(docstr):
            # Text between the first '>' and the second '<' of the match.
            raw_name = tag[tag.find('>') + 1:tag.find('<', tag.find('<') + 1)]
            name = self.unify(raw_name)
            counts[name] = counts.get(name, 0) + 1

    def unify(self, city):
        """Return city lower-cased, without '.' or ',', with each run of
        whitespace collapsed to one space and the ends trimmed."""
        cleaned = city.lower().replace(".", "").replace(",", "")
        return ' '.join(cleaned.split())

    def _write_rows(self, row_format):
        """Write one row_format % (city, doc_id, count) line per entry."""
        for doc_id in self.doc_dict:
            counts = self.doc_dict[doc_id]
            for city in counts:
                self.f1.write(row_format % (city, doc_id, counts[city]))

    def out_city_count_list_comma(self):
        """Write "city,doc,count" rows (comma separated, no header row)
        to self.f1, unless self.silent is set."""
        if not self.silent:
            self._write_rows("%s,%s,%.0f\n")

    def out_city_count_list(self):
        """Write a header row and fixed-width "city doc count" rows to
        self.f1, unless self.silent is set."""
        if not self.silent:
            self.f1.write("%20s %20s %5s\n" % ("CITY NAME", "DOC", "COUNT"))
            self._write_rows("%20s %20s %5.0f\n")

    def extractDoc(self):
        """Read the next doc from self.f0 and return (body, fid).

        body -- the lines between the <doc> line and the </doc> line,
                concatenated; "" when no further <doc> is found.
        fid  -- self.id_dict entry for the doc's URL, or "" when the URL
                is not in the table.

        The URL is the first token of the line directly after <doc>; it
        carries an extra ':' at the end, which is dropped before the
        lookup. Increments self.docNum per doc read and prints it every
        1000 docs as a progress indicator.
        """
        line = self.f0.readline()
        while line and "<doc>" not in line:
            line = self.f0.readline()
        if not line:
            return "", ""

        line = self.f0.readline()
        fid = ""
        tokens = line.split()
        # A blank or missing URL line has no tokens; leave fid empty
        # rather than raising IndexError.
        if tokens:
            url = tokens[0][:-1]
            if url in self.id_dict:
                fid = self.id_dict[url]

        collected = []
        # readline() returns '' at end of file, so `line` must be tested
        # too or a doc without </doc> would spin here forever.
        while line and "</doc>" not in line:
            collected.append(line)
            line = self.f0.readline()

        self.docNum += 1
        if self.docNum % 1000 == 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(self.docNum)
        return "".join(collected), fid