if self.parse_data["bio"]: self.bio = self.parse_data["bio"] if len(bio_rule): self.bio = extract(bio_rule, self.sec).xpath('string(.)') def _generate_keywords(self): if "keywords" in self.parse_data.keys(): if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1,len(self.keywords)+1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1,len(self.city)+1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1,len(self.timeKeys)+1)] if __name__ == '__main__': CIVILNyuTask = CommonTask(website_name=CIVILNyuClass.__name__, custom_parser=CIVILNyuClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule, default_url="http://engineering.nyu.edu", is_url_joint=True ) CIVILNyuTask.run() print("count:", CIVILNyuTask.count)
else: self.email = extract(email_rule, self.sec).replace('mailto:', '') def _generate_website(self): if "website" in self.parse_data.keys(): if self.parse_data["website"]: regex = '"(.*?)"' res = re.search(regex, str(self.parse_data["website"])) self.website = res.group() else: self.website = "http://www.mccormick.northwestern.edu/mechanical/people/faculty/" def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1, len(self.city) + 1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)] if __name__ == '__main__': MccormickTask = CommonTask(website_name=MccormickClass.__name__, custom_parser=MccormickClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule) MccormickTask.run()
if self.parse_data["email"]: self.email = self.parse_data["email"] def _generate_website(self): if "website" in self.parse_data.keys(): if self.parse_data["website"]: regex = '"(.*?)"' res = re.search(regex, str(self.parse_data["website"])) self.website = res.group() else: self.website = "http://www.me.umn.edu/people/index.shtml" def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1, len(self.city) + 1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)] if __name__ == '__main__': MEUmnTask = CommonTask(website_name=MEUmnClass.__name__, custom_parser=MEUmnClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule) MEUmnTask.run()
def _generate_website(self): if "website" in self.parse_data.keys(): if self.parse_data["website"]: regex = '"(.*?)"' res = re.search(regex, str(self.parse_data["website"])) self.website = res.group() else: self.website = extract(website_rule, self.sec) def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1, len(self.city) + 1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)] if __name__ == '__main__': ECEUcsbTask = CommonTask( website_name=ECEUcsbClass.__name__, custom_parser=ECEUcsbClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule, #="http://www.cs.ucsb.edu", is_url_joint=False) ECEUcsbTask.run()
def _generate_keywords(self): if "keywords" in self.parse_data.keys(): if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1,len(self.keywords)+1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1,len(self.city)+1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1,len(self.timeKeys)+1)] if __name__ == '__main__': from SampleData.cse_nd import base_url,sample_url,data,item_url_rule CSENdTask = CommonTask(website_name=CSENdClass.__name__, custom_parser=CSENdClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule ) CSENdTask.run() print(CSENdTask.count) # from SampleData.ame_nd import * # from utils.connection import extract, fetch # html = fetch("https://engineering.nd.edu/profiles/kchristensen") # a = extract(phone_rule, html).xpath('string(.)').strip().replace('Phone:','') # print(a)
if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1, len(self.city) + 1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)] if __name__ == '__main__': UcsdTask = CommonTask( website_name=UcsdClass.__name__, custom_parser=UcsdClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule, default_url="http://jacobsschool.ucsd.edu/faculty/faculty_bios/", is_url_joint=True) UcsdTask.run() print("count:", UcsdTask.count)
self.bio = self.parse_data["bio"] if bio_rule: self.bio = extract(bio_rule, self.sec) def _generate_keywords(self): if "keywords" in self.parse_data.keys(): if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1,len(self.keywords)+1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1,len(self.city)+1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1,len(self.timeKeys)+1)] if __name__ == '__main__': from SampleData.ame_nd import base_url,sample_url,data,item_url_rule from CustomParser.ame_nd_parser import AmeNdClass AmeNdTask = CommonTask(website_name=AmeNdClass.__name__, custom_parser=AmeNdClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule ) AmeNdTask.run() print(AmeNdTask.count)
if self.parse_data["bio"]: self.bio = self.parse_data["bio"] if bio_rule: self.bio = extract(bio_rule, self.sec) def _generate_keywords(self): if "keywords" in self.parse_data.keys(): if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1,len(self.keywords)+1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1,len(self.city)+1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1,len(self.timeKeys)+1)] if __name__ == '__main__': from SampleData.ee_nd import base_url,sample_url,data,item_url_rule EENdTask = CommonTask(website_name=EENdClass.__name__, custom_parser=EENdClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule ) EENdTask.run() print(EENdTask.count)
def _generate_website(self): if "website" in self.parse_data.keys(): if self.parse_data["website"]: regex = '"(.*?)"' res = re.search(regex, str(self.parse_data["website"])) self.website = res.group() else: self.website = extract(website_rule, self.sec) def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1, len(self.city) + 1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)] if __name__ == '__main__': CYBERUmdTask = CommonTask(website_name=CYBERUmdClass.__name__, custom_parser=CYBERUmdClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule, default_url="http://www.cyber.umd.edu", is_url_joint=True) CYBERUmdTask.run()
self.bio = self.parse_data["bio"] if bio_rule: self.bio = extract(bio_rule, self.sec) def _generate_keywords(self): if "keywords" in self.parse_data.keys(): if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1,len(self.keywords)+1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1,len(self.city)+1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1,len(self.timeKeys)+1)] if __name__ == '__main__': from SampleData.me_udel import base_url,sample_url,data,item_url_rule MEUdelTask = CommonTask(website_name=MEUdelClass.__name__, custom_parser=MEUdelClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule, default_url="http://www.me.udel.edu/people/", is_url_joint=True ) MEUdelTask.run() print("count:",MEUdelTask.count)
if "keywords" in self.parse_data.keys(): if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1, len(self.city) + 1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)] if __name__ == '__main__': CEUdelTask = CommonTask(website_name=CEUdelClass.__name__, custom_parser=CEUdelClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule, default_url="http://www.ce.udel.edu/directories/", is_url_joint=True) CEUdelTask.run()
if "keywords" in self.parse_data.keys(): if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1, len(self.keywords) + 1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1, len(self.city) + 1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1, len(self.timeKeys) + 1)] if __name__ == '__main__': BIONyuTask = CommonTask(website_name=BIONyuClass.__name__, custom_parser=BIONyuClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule, default_url="http://engineering.nyu.edu/", is_url_joint=True) BIONyuTask.run()
self.bio = self.parse_data["bio"] if bio_rule: self.bio = extract(bio_rule, self.sec) def _generate_keywords(self): if "keywords" in self.parse_data.keys(): if self.parse_data["keywords"]: self.keywords.append(self.parse_data["keywords"]) def _generate_city(self): pass def _generate_time(self): pass def _generate_keywordKeys(self): self.keywordKeys = [i for i in range(1,len(self.keywords)+1)] def _generate_cityKeys(self): self.cityKeys = [i for i in range(1,len(self.city)+1)] def _generate_timeKeys(self): self.timeKeys = [i for i in range(1,len(self.timeKeys)+1)] if __name__ == '__main__': from SampleData.cbe_udel import base_url,sample_url,data,item_url_rule CBETask = CommonTask(website_name=CBEUdelClass.__name__, custom_parser=CBEUdelClass, base_url=base_url, sample_url=sample_url, data=data, item_url_rule=item_url_rule, default_url="http://www.cbe.udel.edu/directory/", is_url_joint=True ) CBETask.run() print(CBETask.count)