# requires (module level): from selenium import webdriver; import time, json
def extract_from_full(self, url):
    '''
    Purpose: Extract the full info when html is available
    '''
    title = ''
    abstract = ''
    methodology = ''
    acknowledgments = ''
    si = self.solr

    # render the article page with Selenium so JS-built sections are present
    thisArticle = webdriver.Firefox()
    # thisArticle = webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub',
    #                                desired_capabilities=DesiredCapabilities.FIREFOX)
    thisArticle.get(url)
    time.sleep(3)

    # Tika returns the page's <meta> tags as a JSON string
    pageMetaData = parser.parse1('meta', url)[1]
    title = thisArticle.title
    try:
        abstract = thisArticle.find_element_by_id('abstract') \
                              .find_element_by_tag_name('p').text.encode('utf-8')
        sections = thisArticle.find_elements_by_xpath(
            "//article[contains(@id,'main-content')]/section[contains(@id,'eft')]")
        for section in sections:
            try:
                if section.find_element_by_tag_name('h2').text in ('Acknowledgment', 'Acknowledgments'):
                    for i in section.find_elements_by_tag_name('p'):
                        acknowledgments += i.text.encode('utf-8')
            except:
                try:
                    # collect any section whose heading mentions data/method(ology)
                    headingWords = section.find_element_by_tag_name('h2').text.lower().split(' ')
                    if any(word in ('data', 'methodology', 'method') for word in headingWords):
                        for i in section.find_elements_by_tag_name('p'):
                            methodology += i.text.encode('utf-8')
                except:
                    continue
    except:
        # this article is not a 'research' document, maybe like
        # http://onlinelibrary.wiley.com/doi/10.1002/2015EF000307/full
        print 'skipping %s' % url
        self.f.write('skipping %s\n' % url)

    # TODO: place extractors here to get spatial & temporal data from extracted text to index
    thisArticle.close()

    thisArticleJSON = {'id': url, 'title': title, 'abstract': abstract,
                       'methodology': methodology, 'acknowledgment': acknowledgments}

    # add selected page metadata from the Tika response
    i = json.loads(pageMetaData.encode('utf-8'))
    partMeta = {'citation_author': i['citation_author'],
                'article_references': i['article_references'].encode('utf-8'),
                'citation_author_institution': i['citation_author_institution'],
                'citation_doi': i['citation_doi'].encode('utf-8'),
                'citation_journal_title': i['citation_journal_title'].encode('utf-8'),
                'citation_keywords': i['citation_keywords'],
                'citation_publisher': i['citation_publisher'].encode('utf-8'),
                'citation_online_date': i['citation_online_date'].encode('utf-8')}
    thisArticleJSON.update(partMeta)

    # index data into solr
    si.add(thisArticleJSON)
    self.f.write('added entry to solr DB\n')
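# --- Usage sketch (illustrative, not from the original source) --------------
# extract_from_full is an instance method that assumes its owner carries a
# Solr client in self.solr and an open log file in self.f. The stub class and
# names below are assumptions made purely to show those expectations.
class CrawlerStub(object):
    def __init__(self, solr_client, log_file):
        self.solr = solr_client   # any object exposing .add(doc)
        self.f = log_file         # open file handle used for logging

# stub = CrawlerStub(solr_client=my_solr, log_file=open('crawl.log', 'a'))
# CrawlerStub.extract_from_full = extract_from_full  # attach the method above
# stub.extract_from_full('http://onlinelibrary.wiley.com/doi/10.1002/2015EF000307/full')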
'''
    # (the opening 'if' of this block is truncated above this fragment)
    if 'Date/Time Original' in metadata:
        postDate = metadata['Date/Time Original']
    else:
        postDate = ''
else:
    content_type = ''
    description = ''
    itemKeywords = ''
    title = ''

'''
use tika-server to get content in plain text. A new .geot file is created with
contents in plain text, which will be sent to tika-server with geo-parsing
capabilities to extract geo-related metadata
'''
textResponse = parser.parse1('text', curFilePath)
if not textResponse:
    parsedText = ''
else:
    parsedText = textResponse[1]

'''
parse out the 'posted date' of a gun ad if we are dealing with html pages
'''
if postDate == '':
    postDateStart = parsedText.find('Posted')
    if postDateStart != -1:
        postDate = parsedText[postDateStart + 8:postDateStart + 40]
    else:
        postDate = '1900-01-01 00:00:00'
else
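# Worked example (hypothetical ad text) of the 'Posted' slice above: find()
# locates 'Posted', +8 skips past 'Posted: ' to the start of the date, and
# the next 32 characters are kept as the raw posted-date string.
sample = 'Posted: 2015-06-01 12:30:00 by seller123 in Houston'
start = sample.find('Posted')
print sample[start + 8:start + 40]   # -> '2015-06-01 12:30:00 by seller123'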
# Variant of extract_from_full: uses headless PhantomJS, writes a JSON copy of
# each record to disk, and makes the Solr indexing step optional.
def extract_from_full(self, url):
    '''
    Purpose: Extract the full info when html is available
    '''
    title = ''
    abstract = ''
    methodology = ''
    acknowledgments = ''
    si = self.solr

    # render the article page headlessly so JS-built sections are present
    thisArticle = webdriver.PhantomJS()
    thisArticle.get(url)
    time.sleep(3)

    # Tika returns the page's <meta> tags as a JSON string
    pageMetaData = parser.parse1('meta', url)[1]
    title = thisArticle.title
    try:
        abstract = thisArticle.find_element_by_id('abstract') \
                              .find_element_by_tag_name('p').text.encode('utf-8')
        sections = thisArticle.find_elements_by_xpath(
            "//article[contains(@id,'main-content')]/section[contains(@id,'eft')]")
        for section in sections:
            try:
                if section.find_element_by_tag_name('h2').text in ('Acknowledgment', 'Acknowledgments'):
                    for i in section.find_elements_by_tag_name('p'):
                        acknowledgments += i.text.encode('utf-8')
            except:
                try:
                    # collect any section whose heading mentions data/method(ology)
                    headingWords = section.find_element_by_tag_name('h2').text.lower().split(' ')
                    if any(word in ('data', 'methodology', 'method') for word in headingWords):
                        for i in section.find_elements_by_tag_name('p'):
                            methodology += i.text.encode('utf-8')
                except:
                    continue
    except:
        # this article is not a 'research' document, maybe like
        # http://onlinelibrary.wiley.com/doi/10.1002/2015EF000307/full
        print 'skipping %s' % url
        self.f.write('skipping %s\n' % url)

    # TODO: place extractors here to get spatial & temporal data from extracted text to index
    thisArticle.close()

    thisArticleJSON = {'id': url, 'title': title, 'abstract': abstract,
                       'methodology': methodology, 'acknowledgment': acknowledgments}

    # add selected page metadata from the Tika response
    i = json.loads(pageMetaData.encode('utf-8'))
    partMeta = {'citation_author': i['citation_author'],
                'article_references': i['article_references'].encode('utf-8'),
                'citation_author_institution': i['citation_author_institution'],
                'citation_doi': i['citation_doi'].encode('utf-8'),
                'citation_journal_title': i['citation_journal_title'].encode('utf-8'),
                'citation_keywords': i['citation_keywords'],
                'citation_publisher': i['citation_publisher'].encode('utf-8'),
                'citation_online_date': i['citation_online_date'].encode('utf-8')}
    thisArticleJSON.update(partMeta)

    # save a JSON copy of the record
    if not os.path.exists(os.getcwd() + '/jsonFiles'):
        os.makedirs(os.getcwd() + '/jsonFiles')
    filenameJSON = os.getcwd() + '/jsonFiles/' + url.split('://')[1].replace('/', '-') + '.json'
    with open(filenameJSON, 'w+') as f:
        json.dump(thisArticleJSON, f)

    # index data into solr
    if self.solrIntegration:
        si.add(thisArticleJSON)
        self.f.write('added entry to solr DB\n')
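# Sketch of the Solr handle assumed in self.solr above. The original client is
# not shown in these fragments; pysolr and the core name 'articles' are used
# here only as an assumption to illustrate the .add() call. Note pysolr's
# add() expects a list of documents, whereas the method above passes a single
# dict, so the original client's add() evidently accepts one document directly.
import pysolr
solr = pysolr.Solr('http://localhost:8983/solr/articles')
solr.add([{'id': 'http://example.org/doc', 'title': 'demo'}])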