Example #1
    def extract_from_full(self, url):
        '''
        Purpose: extract the full article info when the html full text is available
        '''
        title = ''
        abstract = ''
        methodology = ''
        acknowledgments = ''

        si = self.solr

        # Firefox must be installed locally; a Remote()/Selenium-Grid driver works as well
        thisArticle = webdriver.Firefox()
        thisArticle.get(url)
        time.sleep(3)  # crude wait for the JS-rendered article body to load

        pageMetaData = parser.parse1('meta', url)[1]  # raw JSON metadata string from the Tika server
        title = thisArticle.title

        try:
            abstract = thisArticle.find_element_by_id("abstract").find_element_by_tag_name('p').text.encode('utf-8')
            sections = thisArticle.find_elements_by_xpath("//article[contains(@id,'main-content')]/section[contains(@id,'eft')]")

            for section in sections:
                try:
                    heading = section.find_element_by_tag_name('h2').text
                except Exception:
                    # section has no <h2> heading, so there is nothing to classify
                    continue
                if 'Acknowledgment' in heading:
                    for p in section.find_elements_by_tag_name('p'):
                        acknowledgments += p.text.encode('utf-8')
                elif any(word in ('data', 'methodology', 'method') for word in heading.lower().split()):
                    for p in section.find_elements_by_tag_name('p'):
                        methodology += p.text.encode('utf-8')
        except Exception:
            # no abstract, so this is not a 'research' document,
            # e.g. http://onlinelibrary.wiley.com/doi/10.1002/2015EF000307/full
            print 'skipping %s' % url
            self.f.write('skipping %s\n' % url)

        #TODO: place extractors here to get spatial & temporal data from the extracted text to index
        thisArticle.quit()  # quit() shuts down the browser process, not just the window

        thisArticleJSON = {'id': url, 'title': title, 'abstract': abstract, 'methodology': methodology, 'acknowledgment': acknowledgments}
        
        # merge the selected Tika page metadata into the record
        meta = json.loads(pageMetaData.encode('utf-8'))
        partMeta = {
            'citation_author': meta['citation_author'],
            'article_references': meta['article_references'].encode('utf-8'),
            'citation_author_institution': meta['citation_author_institution'],
            'citation_doi': meta['citation_doi'].encode('utf-8'),
            'citation_journal_title': meta['citation_journal_title'].encode('utf-8'),
            'citation_keywords': meta['citation_keywords'],
            'citation_publisher': meta['citation_publisher'].encode('utf-8'),
            'citation_online_date': meta['citation_online_date'].encode('utf-8'),
        }
        thisArticleJSON.update(partMeta)

        #index data into solr
        si.add(thisArticleJSON)
        self.f.write('added entry to solr DB\n')
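
Here parser appears to be the Apache Tika Python binding (tika-python): parser.parse1(service, url) sends the page to a local Tika server and returns a (status, response) tuple, so the [1] above is the raw JSON metadata string. A quick sanity-check sketch of that assumption, reusing the Wiley URL already mentioned in the comments (it assumes a Tika server is reachable on the default port):

    from tika import parser  # tika-python; starts/contacts a local Tika server on first use
    import json

    status, rawMeta = parser.parse1('meta', 'http://onlinelibrary.wiley.com/doi/10.1002/2015EF000307/full')
    meta = json.loads(rawMeta)
    # the citation_* keys mirror the <meta name="citation_..."> tags embedded in the page
    print(sorted(key for key in meta if key.startswith('citation_')))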
Example #2
    # excerpt of a file-extraction routine for gun ads; it begins mid-function,
    # inside the branch that handles files whose html metadata was extracted
        if 'Date/Time Original' in metadata:
            postDate = metadata['Date/Time Original']
        else:
            postDate = ''
    else:
        # no usable metadata was extracted for this file
        content_type = ''
        description = ''
        itemKeywords = ''
        title = ''

    '''
    use tika-server to get the content in plain text. A new .geot file is created with the
    contents in plain text, which is sent to tika-server with geo-parsing capabilities to
    extract geo-related metadata
    '''
    textResponse = parser.parse1('text', curFilePath)
    if not textResponse:
        parsedText = ''
    else:
        parsedText = textResponse[1]

    '''
    parse out the 'posted date' of gun ads when dealing with html pages
    '''
    if postDate == '':
        postDateStart = parsedText.find('Posted')
        if postDateStart != -1:
            postDate = parsedText[postDateStart + 8:postDateStart + 40]
        else:
            postDate = '1900-01-01 00:00:00'
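
The fixed-offset slice in that last block is easier to see on a concrete input: find('Posted') returns the index of the leading 'P', +8 skips over 'Posted: ', and the 32-character window is a heuristic wide enough for a timestamp plus whatever trails it. A quick illustration (the ad text below is made up):

    parsedText = 'Posted: 2015-06-01 14:22:07 Mountain View, CA'
    start = parsedText.find('Posted')
    print(parsedText[start + 8:start + 40])  # -> '2015-06-01 14:22:07 Mountain Vie'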
Example #3
    def extract_from_full(self, url):
        '''
        Purpose: extract the full article info when the html full text is available
        '''
        title = ''
        abstract = ''
        methodology = ''
        acknowledgments = ''

        si = self.solr

        thisArticle = webdriver.PhantomJS()  # headless driver, so no browser window is required
        thisArticle.get(url)
        time.sleep(3)  # crude wait for the JS-rendered article body to load

        pageMetaData = parser.parse1('meta', url)[1]  # raw JSON metadata string from the Tika server
        title = thisArticle.title

        try:
            abstract = thisArticle.find_element_by_id("abstract").find_element_by_tag_name('p').text.encode('utf-8')
            sections = thisArticle.find_elements_by_xpath("//article[contains(@id,'main-content')]/section[contains(@id,'eft')]")

            for section in sections:
                try:
                    heading = section.find_element_by_tag_name('h2').text
                except Exception:
                    # section has no <h2> heading, so there is nothing to classify
                    continue
                if 'Acknowledgment' in heading:
                    for p in section.find_elements_by_tag_name('p'):
                        acknowledgments += p.text.encode('utf-8')
                elif any(word in ('data', 'methodology', 'method') for word in heading.lower().split()):
                    for p in section.find_elements_by_tag_name('p'):
                        methodology += p.text.encode('utf-8')
        except Exception:
            # no abstract, so this is not a 'research' document,
            # e.g. http://onlinelibrary.wiley.com/doi/10.1002/2015EF000307/full
            print 'skipping %s' % url
            self.f.write('skipping %s\n' % url)

        #TODO: place extractors here to get spatial & temporal data from the extracted text to index
        thisArticle.quit()  # quit() shuts down the browser process, not just the window

        thisArticleJSON = {'id': url, 'title': title, 'abstract': abstract, 'methodology': methodology, 'acknowledgment': acknowledgments}
        
        # merge the selected Tika page metadata into the record
        meta = json.loads(pageMetaData.encode('utf-8'))
        partMeta = {
            'citation_author': meta['citation_author'],
            'article_references': meta['article_references'].encode('utf-8'),
            'citation_author_institution': meta['citation_author_institution'],
            'citation_doi': meta['citation_doi'].encode('utf-8'),
            'citation_journal_title': meta['citation_journal_title'].encode('utf-8'),
            'citation_keywords': meta['citation_keywords'],
            'citation_publisher': meta['citation_publisher'].encode('utf-8'),
            'citation_online_date': meta['citation_online_date'].encode('utf-8'),
        }
        thisArticleJSON.update(partMeta)

        #save a json copy of the record, named after the url
        jsonDir = os.path.join(os.getcwd(), 'jsonFiles')
        if not os.path.exists(jsonDir):
            os.makedirs(jsonDir)
        filenameJSON = os.path.join(jsonDir, url.split('://')[1].replace('/', '-') + '.json')
        with open(filenameJSON, 'w+') as f:
            json.dump(thisArticleJSON, f)

        #index data into solr
        if self.solrIntegration:
            si.add(thisArticleJSON)
            self.f.write('added entry to solr DB\n')
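
Neither excerpt shows the class around extract_from_full. A minimal harness might look like the sketch below; the ArticleExtractor name, the sunburnt client, and the log-file path are all assumptions made for illustration (sunburnt's SolrInterface.add accepts a single dict, which matches the si.add(thisArticleJSON) calls above):

    from sunburnt import SolrInterface  # assumed Solr client behind self.solr

    class ArticleExtractor(object):  # hypothetical wrapper; the excerpts only show the method body
        def __init__(self, solr_url, solrIntegration=True):
            self.solr = SolrInterface(solr_url)
            self.solrIntegration = solrIntegration
            self.f = open('extract.log', 'a')  # assumed log file written via self.f

        # extract_from_full(self, url) as defined in Example #3 above

    # hypothetical usage:
    # ArticleExtractor('http://localhost:8983/solr/').extract_from_full('http://onlinelibrary.wiley.com/doi/10.1002/2015EF000307/full')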