def get_contact(self, response: Response) -> (dict, dict):
    """
    Gets the manager listed on the page together with the contact record.

    :param response: the response object
    :return: a tuple of two dicts, the manager (a user) and the contact
        information that is attached to that manager
    """
    manager_node = "//dd[@class='manager']"
    contact = {
        'email': '',
        'phone': '',
        'website': response.url,
        'meet': '',
    }

    manager_name = response.xpath(manager_node + "/a/text()").get()
    profile_link = response.xpath(manager_node + "/a/@href").get()

    manager = create_user()
    manager['name'] = manager_name
    manager['ref'] = profile_link

    # The first <em> holds a comma separated list that is stored as tags.
    raw_tags = response.xpath(manager_node + "/div/em[1]/text()").get()
    if isinstance(raw_tags, str):
        manager['tag'] = remove_empty_string_from_array(raw_tags.split(', '))

    # The second <em> is stored as the phone entry.
    contact['phone'] = response.xpath(manager_node + "/div/em[2]/text()").get()

    manager['contact'] = contact
    # NOTE(review): if create_user() returns a plain dict, manager['contact']
    # is the same object as `contact`, so this also replaces the page URL
    # stored in contact['website'] -- confirm that this is intended.
    manager['contact']['website'] = profile_link

    self.log('find manager {} with contact {}'.format(manager, contact),
             level=logging.DEBUG)
    return manager, contact
def add_inventors(self, response: Response) -> list:
    """
    Add inventors to the project.

    Every element whose class contains ``node-inventor`` yields one user.
    The heading text becomes the name, the heading link (prefixed with the
    site root) becomes the reference / website, and the department and
    title fields are copied into ``abs`` and the experience title.

    :param response: Response object
    :return: a list of inventors
    """
    site_root = 'http://tlo.mit.edu'
    inventors = []
    for row in response.xpath("//div[contains(@class, 'node-inventor')]"):
        name = row.xpath("string(h2)").get()
        href = row.xpath("h2//a/@href").get()
        department = row.xpath(
            "string(div[@class='content']/div[contains(@class, 'field-name-field-depa')])"
        ).get()
        title = row.xpath(
            "string(div[@class='content']/div[contains(@class, 'field-name-field-link-title')])"
        ).get()

        user = create_user()
        user['name'] = name
        user['exp']['exp']['company'] = self.name
        user['exp']['exp']['title'] = title
        # .get() returns None when the heading carries no link; concatenating
        # None raised TypeError and aborted the whole page, so the link
        # fields are only filled in when a link is actually present.
        if href is not None:
            link = site_root + href
            user['ref'] = link
            user['contact']['website'] = link
        user['abs'] = department
        inventors.append(user)
        self.log('Found inventor {}'.format(user['name']), level=logging.DEBUG)
    return inventors
def get_contact_person(self, response: Response) -> dict:
    """
    Build the contact person of the project.

    The name is the string value of the element with id ``contact-person``
    and the company is this spider's name.

    :param response: Response object
    :return: the contact person as a user
    """
    contact_person = create_user()
    person_name = response.xpath("string(//*[@id='contact-person'])").get()
    contact_person['name'] = person_name
    contact_person['exp']['exp']['company'] = self.name
    message = 'Found contact person {}'.format(contact_person['name'])
    self.log(message, level=logging.DEBUG)
    return contact_person
def upload_author(self, author_impact: dict) -> dict:
    """
    Upload the unique authors to the database.

    Authors come from ``self.authors.all_elements()``. Each key ``a`` is
    indexed as ``a[0]`` (stored as the name) and ``a[1]`` (stored as
    ``abs`` / ``ref`` and used as the source text for e-mail, phone and
    address). Users are sent with ``add_record('entity', ...)`` in batches
    of 1000 and the resulting id map is pickled to ``pubmed_author_ids.cp``.

    :param author_impact: author citation data; entries are read through
        their ``'citation'`` and ``'keyword'`` keys
    :return: a dictionary using ``(name, abs)`` of each uploaded author as
        key and its ``_id`` in the database as value
    """
    batch_size = 1000
    author_dict = self.authors.all_elements()
    user_ids = {}

    def upload_batch(batch):
        """Send one batch and record the ids returned for it."""
        response = add_record('entity', batch)
        if response['_status'] != 'OK':
            # The original message called .format() without a placeholder,
            # so nothing about the failure was ever logged.
            self.logger.error('fail to create {} users (status: {})'.format(
                len(batch), response['_status']))
            return
        for u, r in zip(batch, response['_items']):
            user_ids[(u['name'], u['abs'])] = r['_id']

    users = []
    for a in author_dict:
        user = create_user()
        user['name'] = a[0]
        user['abs'] = a[1]
        user['ref'] = a[1]
        user['contact']['email'] = normalize_email(a[1])
        user['contact']['phone'] = normalize_phone(a[1])
        # The original assigned user['exp']['impact'] twice, so the citation
        # value was always overwritten by the keyword list. Keywords are now
        # stored as tags.
        # NOTE(review): 'tag' is chosen because it is the list-valued user
        # field; confirm the intended target field for keywords.
        if a in author_impact:
            user['exp']['impact'] = author_impact[a]['citation']
            user['tag'] = author_impact[a]['keyword']
        else:
            user['exp']['impact'] = 0
            user['tag'] = []
        user['onepage']['bg'] = json.dumps([u[1] for u in author_dict[a]])

        address = self.parse_address(a[1])
        if address is not None:
            user['addr'] = address
        else:
            user['addr']['city'] = 'Unknown'
            user['addr']['country'] = 'Unknown'

        users.append(user)
        if len(users) >= batch_size:
            upload_batch(users)
            users = []

    # Upload whatever is left over after the last full batch.
    if users:
        upload_batch(users)

    # Use a context manager so the file is flushed and closed.
    with open('pubmed_author_ids.cp', 'wb') as id_file:
        pickle.dump(user_ids, id_file)
    return user_ids
def _add_sponsor(data: dict) -> dict:
    """
    Build the sponsor of a clinical trial.

    :param data: a dictionary with the sponsor data; ``'agency'`` is
        required, ``'agency_class'`` is optional
    :return: the sponsor as a user with type 32
    """
    sponsor = create_user()
    sponsor['name'] = data['agency']
    # NOTE(review): the meaning of type code 32 is not visible here.
    sponsor['type'] = 32
    has_class = 'agency_class' in data
    if has_class:
        sponsor['tag'] = data['agency_class']
    return sponsor
def add_inventors(self, response):
    """
    Add inventors to the project.

    One user is created per ``dd`` element with class ``inventor``: the
    link text is the name, the link target is the reference / website and
    the string value of the first ``div`` is the abstract.

    :param response: Response object
    :return: a list of inventors
    """
    collected = []
    for entry in response.xpath("//dd[@class='inventor']"):
        inventor_name = entry.xpath("a/text()").get()
        profile_url = entry.xpath("a/@href").get()
        summary = entry.xpath('string(div[1])').get()

        inventor = create_user()
        inventor['name'] = inventor_name
        inventor['abs'] = summary
        inventor['ref'] = profile_url
        inventor['contact']['website'] = profile_url
        inventor['exp']['exp']['company'] = self.name
        collected.append(inventor)

        message = 'Found inventor {}'.format(inventor['name'])
        self.log(message, level=logging.DEBUG)
    return collected
def add_inventors(self, response: Response) -> list:
    """
    Add inventors to the project.

    Each link text inside the ``div`` with class ``inventors`` becomes one
    user whose company is this spider's name.

    :param response: Response object
    :return: a list of inventors
    """
    link_texts = response.xpath("//div[@class='inventors']/a/text()").getall()
    found = []
    for inventor_name in link_texts:
        inventor = create_user()
        inventor['name'] = inventor_name
        inventor['exp']['exp']['company'] = self.name
        found.append(inventor)
        message = 'Found inventor {}'.format(inventor['name'])
        self.log(message, level=logging.DEBUG)
    return found
def add_inventors(self, response: Response) -> list:
    """
    Add inventors to the project.

    Every non-empty list item text inside the inventors block becomes one
    user whose company is this spider's name.

    :param response: Response object
    :return: a list of inventors
    """
    item_texts = response.xpath(
        "//div[@class='ncd-data inventors display-block indented']/ul/li/text()"
    ).getall()
    found = []
    for inventor_name in item_texts:
        # Only text nodes with at least one character are kept.
        if len(inventor_name) >= 1:
            inventor = create_user()
            inventor['name'] = inventor_name
            inventor['exp']['exp']['company'] = self.name
            found.append(inventor)
            message = 'Found inventor {}'.format(inventor['name'])
            self.log(message, level=logging.DEBUG)
    return found
def get_contact(self, response: Response) -> dict:
    """
    Get contact information of the project.

    The contact is read from the ``div`` with class ``tech-manager``: the
    link text is the name, the link target (without its ``mailto:``
    prefix) is the e-mail, and the phone is extracted from the first text
    node of the block.

    :param response: Response object
    :return: the contact as a user
    """
    mail_prefix = 'mailto:'
    user = create_user()

    href = response.xpath("//div[@class='tech-manager']/a/@href").get()
    # The original sliced the result of .get() directly, which raised
    # TypeError when the link was missing, before its own None check ran.
    if href is None:
        email = ''
    elif href[:len(mail_prefix)].lower() == mail_prefix:
        email = href[len(mail_prefix):]
    else:
        # Not a mailto link: keep the value instead of chopping 7 characters.
        email = href

    user['name'] = response.xpath("//div[@class='tech-manager']/a/text()").get()
    user['contact']['email'] = email

    phone = extract_phone(
        response.xpath("//div[@class='tech-manager']/text()").get())
    # NOTE(review): the whole result of extract_phone is stored; confirm
    # whether a single number (e.g. the first match) is expected instead.
    if len(phone) > 0:
        user['contact']['phone'] = phone
    return user
def upload_user_to_server(file_name):
    """
    Convert pickled author data into users and dump them as JSON.

    The pickle is iterated by key; each key ``a`` is indexed as ``a[0]``
    (stored as the name) and ``a[1]`` (stored as ``abs`` / ``ref`` and used
    as the source text for e-mail, phone and address). The value of each
    key is read through ``'keyword'`` and ``'citation'``. The resulting
    list is written to ``pubmed_author.json``.

    :param file_name: path of the pickle file to read
    """
    # SECURITY: pickle.load can execute arbitrary code from the file; only
    # feed this function files produced by a trusted run.
    with open(file_name, 'rb') as source:
        data = pickle.load(source)

    users = []
    for a, stats in data.items():
        user = create_user()
        user['name'] = a[0]
        user['abs'] = a[1]
        user['ref'] = a[1]
        user['contact']['email'] = normalize_email(a[1])
        user['contact']['phone'] = normalize_phone(a[1])
        user['tag'] = stats['keyword']
        user['onepage']['prod'] = stats['citation']
        # try to parse the address
        addr = parse_us_address(a[1])
        if addr is not None:
            user['addr'] = addr
        # The original never collected the user, so an empty list was dumped.
        users.append(user)

    # json.dump writes str, so the file must be opened in text mode
    # (the original 'wb' raises TypeError on Python 3).
    with open('pubmed_author.json', 'w') as target:
        json.dump(users, target)
def add_inventors(self, response: Response) -> list:
    """
    Add inventors to the project.

    Each link in the first nested table of the ``formTechPub1`` form that
    has a non-empty text becomes one user whose company is this spider's
    name.

    :param response: Response object
    :return: a list of inventors
    """
    inventors = []
    for row in response.xpath(
            '//*[@id="formTechPub1"]/div/table/tr/td/table[1]/tr/td/a'):
        name = row.xpath("text()").get()
        # .get() returns None for a link without a text node; the original
        # len(name) check raised TypeError in that case instead of skipping.
        if not name:
            continue
        user = create_user()
        user['name'] = name
        user['exp']['exp']['company'] = self.name
        inventors.append(user)
        self.log('Found inventor {}'.format(user['name']), level=logging.DEBUG)
    return inventors
def add_inventors(self, response: Response) -> list:
    """
    Add inventors to the project.

    Each link inside a ``dd`` element with class ``inventor`` becomes one
    user: the link text is the name and the link target is stored as both
    the reference and the website.

    :param response: Response object
    :return: a list of inventors
    """
    found = []
    for anchor in response.xpath("//dd[@class='inventor']/a"):
        inventor_name = anchor.xpath("text()").get()
        profile_url = anchor.xpath("@href").get()

        inventor = create_user()
        inventor['name'] = inventor_name
        inventor['ref'] = profile_url
        inventor['contact']['website'] = profile_url
        inventor['exp']['exp']['company'] = self.name
        found.append(inventor)

        message = 'Found inventor {}'.format(inventor['name'])
        self.log(message, level=logging.DEBUG)
    return found
def add_inventors(self, response: Response) -> list:
    """
    Add inventors to the project.

    Only side blocks whose heading text is exactly ``Investigators:`` are
    used; every non-empty list item text in such a block becomes one user
    whose company is this spider's name.

    :param response: Response object
    :return: a list of inventors
    """
    found = []
    side_blocks = response.xpath("//div[@class='side-bucket invention-side-block']")
    for block in side_blocks:
        heading = block.xpath("h4[@class='side-heading']/text()").get()
        if heading != 'Investigators:':
            continue
        for inventor_name in block.xpath("ul/li/text()").getall():
            if len(inventor_name) >= 1:
                inventor = create_user()
                inventor['name'] = inventor_name
                inventor['exp']['exp']['company'] = self.name
                found.append(inventor)
                message = 'Found inventor {}'.format(inventor['name'])
                self.log(message, level=logging.DEBUG)
    return found
def get_contact(self, response: Response) -> list:
    """
    Gets the contact information.

    One user is created per child ``div`` of an ``associate-item`` block.
    Link and image targets are made absolute with ``response.urljoin``;
    the team position is stored as both ``abs`` and the experience title.

    :param response: the response object
    :return: a list of contacts
    """
    users = []
    for row in response.xpath("//div[@class='associate-item']/div"):
        user = create_user()
        user['ref'] = response.urljoin(row.xpath("a/@href").get())
        user['contact']['website'] = user['ref']
        user['logo'] = response.urljoin(row.xpath("a/img/@src").get())
        user['name'] = row.xpath("h4[@class='team-name']/a/text()").get()
        user['abs'] = row.xpath("strong[@class='team-position']/text()").get()
        user['exp']['exp']['title'] = user['abs']
        user['exp']['exp']['company'] = self.name
        # These relative paths must be evaluated against the current row.
        # The original ran them on `response`, where a relative "ul/..."
        # path does not address the row's own list.
        user['contact']['email'] = row.xpath(
            "ul/li[@class='bottom-item bottom-email']/a/@href").get()
        user['contact']['phone'] = row.xpath(
            "ul/li[@class='bottom-item bottom-phone']/a/text()").get()
        users.append(user)
    return users
def get_contact(self, response: Response) -> dict:
    """
    Gets the contact information of the case manager.

    :param response: the response object
    :return: the case manager as a user, with name, reference / website,
        e-mail (empty string when missing) and, when one is found, phone
    """
    manager_node = "//div[@class='case-manager']"
    contact = create_user()

    contact['name'] = response.xpath(manager_node + "/a/text()").get()

    profile_href = response.xpath(manager_node + "/a/@href").get()
    contact['ref'] = response.urljoin(profile_href)
    contact['contact']['website'] = contact['ref']

    contact['contact']['email'] = response.xpath(manager_node + "/span/a/text()").get()
    if contact['contact']['email'] is None:
        contact['contact']['email'] = ''

    # The phone is searched in the string value of the whole block.
    block_text = response.xpath("string(" + manager_node + ")").get()
    numbers = extract_phone(block_text)
    if len(numbers) > 0:
        contact['contact']['phone'] = numbers[0]
    return contact
def _add_user(data: dict) -> dict:
    """
    Add a user's contact.

    All keys of ``data`` are optional. The name parts are joined with
    spaces and the phone and its extension are joined with ``-``.
    ``affiliation`` takes precedence over ``organization``. When no name
    part is present the e-mail is used as the name, or ``'Anonymous'`` if
    there is no e-mail either.

    :param data: a dictionary containing the user information
    :return: a user
    """
    user = create_user()

    name_parts = [
        data[key]
        for key in ('first_name', 'middle_name', 'last_name')
        if key in data
    ]
    user['name'] = ' '.join(name_parts)

    if 'role' in data:
        user['exp']['exp']['title'] = data['role']

    if 'affiliation' in data:
        user['abs'] = data['affiliation']
        user['exp']['exp']['company'] = data['affiliation']
    elif 'organization' in data:
        user['abs'] = data['organization']
        user['exp']['exp']['company'] = data['organization']

    phone_parts = [data[key] for key in ('phone', 'phone_ext') if key in data]
    user['contact']['phone'] = '-'.join(phone_parts)
    user['contact']['email'] = data['email'] if 'email' in data else ''

    if 'degrees' in data:
        # NOTE(review): attribute access on `user`, which is used as a
        # mapping everywhere else; kept unchanged -- confirm whether an item
        # lookup was intended.
        if not user.title:
            user['edu']['degree'] = data['degrees']

    # The original tested len(...) < 0, which is never true, so the
    # fallback below could not run and nameless users kept an empty name.
    if not user['name']:
        if len(user['contact']['email']) > 0:
            user['name'] = user['contact']['email']
        else:
            user['name'] = 'Anonymous'
    return user
def add_inventors(self, response: Response) -> list:
    """
    Add inventors to the project.

    Walks the direct children of the element with id ``alt_toolbox``. Once
    an ``h2`` whose string value is ``Researchers`` has been seen, the list
    item texts of the following child are turned into users.

    :param response: Response object
    :return: a list of inventors
    """
    inventors = []
    # Becomes True after the "Researchers" heading has been passed.
    inventor_found = False
    for row in response.xpath("//div[@id='alt_toolbox']/*"):
        if inventor_found:
            # This child follows the heading: read its list items.
            for name in row.xpath('li/text()').getall():
                # Skip empty text nodes.
                if len(name) < 1:
                    continue
                user = create_user()
                user['name'] = name
                # The company comes from get_name(response), called once per
                # inventor.
                user['exp']['exp']['company'] = self.get_name(response)
                inventors.append(user)
                self.log('Found inventor {}'.format(user['name']), level=logging.DEBUG)
            # NOTE(review): the reviewed source had lost its indentation; this
            # break is assumed to belong to the `if`, so only the first child
            # after the heading is read -- confirm against the original.
            break
        # Look for the heading that introduces the researcher list.
        if row.xpath('name()').get() == 'h2' and row.xpath('string()').get() == 'Researchers':
            inventor_found = True
    return inventors
def add_inventors(self, response: Response) -> list:
    """
    Add inventors to the project.

    Each link in the first paragraph of the body field that has a
    non-empty text becomes one user: the link text is the name and the
    link target is stored as both the reference and the website.

    :param response: Response object
    :return: a list of inventors
    """
    inventors = []
    for row in response.xpath(
            "//div[contains(@class,'field field-name-body field-type-text-with-summary')]/div/div/p[1]/a"
    ):
        name = row.xpath("text()").get()
        link = row.xpath("@href").get()
        # .get() returns None for a link without a text node; the original
        # len(name) check raised TypeError in that case instead of skipping.
        if not name:
            continue
        user = create_user()
        user['name'] = name
        user['ref'] = link
        user['contact']['website'] = user['ref']
        user['exp']['exp']['company'] = self.name
        inventors.append(user)
        self.log('Found inventor {}'.format(user['name']), level=logging.DEBUG)
    return inventors
def parse_html(file):
    """
    Parse a trial registration HTML file into a product and two users.

    The document is parsed twice with ``parse`` and the two results are
    looked up with English and Chinese field labels respectively. The
    product receives name, abstract, status, intro, reference link, tags,
    licence identifiers, dates and a markdown description; the applicant
    and the principal investigator are built as users.

    :param file: the file (path or file object) handed to ``etree.parse``
    :return: a dict with the keys ``'product'``, ``'applicant'`` and
        ``'principal_investigator'``
    """
    document = etree.parse(file, etree.HTMLParser())
    product = create_product()
    # NOTE(review): the language codes look swapped relative to the variable
    # names ('cn' feeds data_english, 'en' feeds data_chinese). Kept as in
    # the original because the meaning of the second argument of parse() is
    # not visible here -- confirm.
    data_english = parse(document, 'cn')
    data_chinese = parse(document, 'en')

    product['name'] = data_chinese[u'注册题目']
    product['abs'] = data_chinese[u'研究目的']
    product['asset']['stat'] = map_status(data_english['Recruiting status'])
    product['intro'] = data_chinese['药物成份或治疗方案详述']
    href = document.xpath("//body/div[4]/div[2]/a")
    product['ref'] = 'http://www.chictr.org.cn/' + (
        href[0].attrib['href'] if len(href) > 0 else '')

    # Each tag is added in its Chinese and its English form. The original
    # appended the study type pair twice; the duplicate is dropped.
    tag_labels = (
        (u'研究疾病', u'Target disease'),
        (u'研究疾病代码', u'Target disease code'),
        (u'研究类型', u'Study type'),
        (u'研究所处阶段', u'Study phase'),
    )
    for chinese_label, english_label in tag_labels:
        product['tag'].append(data_chinese[chinese_label])
        product['tag'].append(data_english[english_label])
    product['tag'] = remove_empty_string(product['tag'])

    product['asset']['lic'].append(data_chinese['研究课题代号(代码)'])
    product['asset']['lic'].append(data_chinese['注册号'])
    product['asset']['lic'].append(data_chinese['伦理委员会批件文号'])
    product['asset']['lic'] = remove_empty_string(product['asset']['lic'])
    product['asset']['type'] = 2

    # Dates are best effort: a missing or unparsable value leaves the field
    # untouched. `except Exception` keeps that behaviour without swallowing
    # KeyboardInterrupt / SystemExit the way the bare `except:` did.
    date_format = "%a, %d %b %Y %H:%M:%S GMT"
    date_fields = (
        ('created', 'Date of Registration'),
        ('updated', 'Date of Last Refreshed on'),
    )
    for field, label in date_fields:
        try:
            product[field] = parser.parse(data_english[label]).strftime(date_format)
        except Exception:
            pass

    product['asset']['tech'] = dictionary_to_markdown(data_english, [
        'Study design',
        'Inclusion criteria',
        'Exclusion criteria',
        'Study execute time',
        'Interventions',
        'Countries of recruitment and research settings',
        'Outcomes',
        'Collecting sample(s) from participants',
        'Participant age',
        'Gender',
        # NOTE(review): this label was split across a line break in the
        # reviewed source; it is joined here with the single space visible
        # before the break -- confirm against the label used on the page.
        'Randomization Procedure (please state who generates the random number sequence and by what method)',
        'Blinding',
        'The time of sharing IPD',
        'The way of sharing IPD”(include metadata and protocol, If use web-based public database, please provide the url)',
        'Data collection and Management (A standard data collection and management system include a CRF and an electronic data capture',
        'Data Managemen Committee'
    ])
    product['asset']['tech'] += dictionary_to_markdown(data_chinese, [
        '研究设计',
        '纳入标准',
        '排除标准',
        '研究实施时间',
        '干预措施',
        '研究实施地点',
        '测量指标',
        '采集人体标本',
        '年龄范围',
        '性别',
        '随机方法(请说明由何人用什么方法产生随机序列)',
        '盲法',
        '原始数据公开时间',
        '共享原始数据的方式(说明:请填入公开原始数据日期和方式,如采用网络平台,需填该网络平台名称和网址)',
        '数据采集和管理(说明:数据采集和管理由两部分组成,一为病例记录表(Case Record Form, CRF),二为电子采集和管理系统(Electronic Data Capture, EDC),如ResMan即为一种基于互联网的EDC',
        '数据管理委员会'
    ])

    applicant = create_user()
    applicant['name'] = data_chinese[u'申请注册联系人']
    applicant['abs'] = 'Applicant'
    applicant['contact']['phone'] = data_chinese[u'申请注册联系人电话']
    applicant['contact']['email'] = data_chinese[u'申请注册联系人电子邮件']
    applicant['contact']['website'] = data_chinese[u'申请单位网址(自愿提供)']
    applicant['addr'] = parse_address(data_english[u'Applicant address'])
    applicant['addr']['zip'] = data_chinese[u'申请注册联系人邮政编码']
    applicant['exp']['exp']['company'] = data_chinese[u'申请人所在单位']

    principal_investigator = create_user()
    principal_investigator['name'] = data_chinese[u'研究负责人']
    principal_investigator['abs'] = 'Principal Investigator'
    principal_investigator['contact']['phone'] = data_chinese[u'研究负责人电话']
    principal_investigator['contact']['email'] = data_chinese[u'研究负责人电子邮件']
    principal_investigator['contact']['website'] = data_chinese[
        u'研究负责人网址(自愿提供)']
    principal_investigator['addr'] = parse_address(
        data_english[u"Study leader's address"])
    principal_investigator['addr']['zip'] = data_chinese[u'研究负责人邮政编码']

    # The product gets its own copy of the applicant's address.
    product['addr'] = copy.deepcopy(applicant['addr'])
    return {
        'product': product,
        'applicant': applicant,
        'principal_investigator': principal_investigator
    }