def main(self): # Searching deliverable page if self.opt['page']: self.links = [self.opt_url] else: self.links = self.pagesearch.get_deliverable_page() ################################## if self.links[0] == -1: return self.links if self.opt['lookup_page']: return (1, self.links) if self.opt['verbose']: print "*" * 80 print "Deliverable page: ", " ".join(self.links) print "*" * 80 pr = RRSProject() #Project - Url relationship if not self.opt['page']: pr_url = RRSUrl(link=self.opt_url) pr_url_rel = RRSRelationshipProjectUrl() pr_url_rel.set_entity(pr_url) pr['url'] = pr_url_rel self.recordhandler.process_pages(self.links) records = self.recordhandler.get_deliverables() if type(records) == list: #create relationship Project Publication for r in records: rel = RRSRelationshipPublicationProject() #print unicode(r['title']) rel.set_entity(r) pr['publication'] = rel #create XML from RRSProject output = StringIO.StringIO() converter = Model2XMLConverter(stream=output) converter.convert(pr) out = output.getvalue() output.close() #Either return RRSProject object or XML in string or store result into a file if self.opt['storefile']: r = self._storeToFile(self.opt_url, out) #test if store ok if r[0] != 1: print r[1] else: print out.encode('UTF-8') return pr else: return records
def _deliv_in_link(self, text, links, entry=False):
    """Build an RRSPublication for an entry whose *link* looks like a
    deliverable (link text matches "DELIVERABLE(S)" or a "D1.2"-style id).

    :param text: iterable of text chunks of the entry (title/description
        candidates).
    :param links: iterable of link strings found in the entry.
    :param entry: optional parent entry used to repair an invalid title.
    :returns: RRSPublication on success, False when the entry does not
        look like a deliverable, or ['-3', message] when the entry seems
        to hold more than one record.
    """
    _title = False
    _description = ""
    # Deliverable ids look like "D12" or "D12.3"; the dot must be escaped,
    # otherwise it would match any character (e.g. "D12x3").
    pattern = re.compile(r"(DELIVERABLES?)|(D[0-9][0-9]*(\.[0-9][0-9]*)?)", re.I)
    _link = False
    for l in links:
        if pattern.search(l):
            if _link == False:
                _link = l
            else:
                # Two matching links in one entry -> ambiguous record.
                return ['-3', 'Probably more records in one entry']
    # Loop through text in entry looking for title and description.
    for t in text:
        if _title == False:
            if len(t) > 10:
                _title = t
        # Set the longest string as description of deliverable.
        if len(_description) < len(t):
            _description = t
    if _title == _description:
        _description = ""
    # If chosen title is not valid, try to find a better one in the parent entry.
    if _title and not self._check_title(_title) and entry != False:
        _title = self._repair_title(entry)
    # Create the publication object.
    if _link:
        pub = RRSPublication(title=_title, abstract=_description)
        typ = RRSPublication_type(type='techreport')
        pub['type'] = typ
        self.__debug("*" * 40)
        self.__debug("Title: " + _title)
        self.__debug("Description: " + _description)
        self.__debug("Link: " + _link)
        l = RRSUrl(link=_link)
        pl_rel = RRSRelationshipPublicationUrl()
        pl_rel.set_entity(l)
        pub['url'] = pl_rel
        return pub
    else:
        # This entry is probably not a deliverable.
        return False
def _more_entry_in_record(self, entry):
    """Scan every 'chunk' element of *entry*; for each chunk that carries
    both text and a link with a wanted MIME type, create a techreport
    RRSPublication (title = chunk text, url = chunk link) and collect it
    in self._entriesFoundInLinks.
    """
    for chunk in entry.iter('chunk'):
        link = chunk.attrib.get("link")
        # Skip chunks lacking text or a link, or pointing at unwanted content.
        if chunk.text is None or link is None:
            continue
        if not self.agent.is_wanted_mime(link):
            continue
        publication = RRSPublication(title=chunk.text)
        publication['type'] = RRSPublication_type(type='techreport')
        relationship = RRSRelationshipPublicationUrl()
        relationship.set_entity(RRSUrl(link=link))
        publication['url'] = relationship
        self._entriesFoundInLinks.append(publication)
def _deliv_in_text(self, text, links):
    """Build an RRSPublication for an entry whose *text* (not its links)
    matches a deliverable pattern ("DELIVERABLE(S)" or a "D1.2"-style id).

    :param text: iterable of text chunks (title/description candidates).
    :param links: a single link string or a list of link strings.
    :returns: RRSPublication on success, False when the entry does not
        look like a deliverable, or ['-3', message] when the entry seems
        to hold more than one record.
    """
    _title = False
    _description = ""
    # Deliverable ids look like "D12" or "D12.3"; the dot must be escaped,
    # otherwise it would match any character (e.g. "D12x3").
    pattern = re.compile(r"(DELIVERABLES?)|(D[0-9][0-9]*(\.[0-9][0-9]*)?)", re.I)
    # Loop through text in entry looking for title and description.
    for t in text:
        if _title == False:
            if pattern.search(t):
                _title = t
        # Set the longest string as description of deliverable.
        if len(_description) < len(t):
            _description = t
    if _title == _description:
        _description = ""
    _link = False
    if isinstance(links, str):
        if self.agent.is_wanted_mime(links):
            _link = links
    elif isinstance(links, list):
        for l in links:
            if self.agent.is_wanted_mime(l):
                if _link == False:
                    _link = l
                else:
                    # A link was already found; tolerate a second one only
                    # if both share the same base name (differ in extension).
                    # (Fixed: original called s.rfind(...) on an undefined
                    # name; str.rfind is the intended operation.)
                    if _link[:_link.rfind('.')] == l[:l.rfind('.')]:
                        break
                    else:
                        return ['-3', 'Probably more records in one entry']
    # Create the publication object.
    if _title:
        pub = RRSPublication(title=_title, abstract=_description)
        _typ = RRSPublication_type(type='techreport')
        pub['type'] = _typ
        self.__debug("*" * 40)
        self.__debug("Title: " + _title)
        self.__debug("Description: " + _description)
        if _link:
            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
        return pub
    else:
        # This entry is probably not a deliverable.
        return False
def test(self, dir, tb=False, remove=False, copy=True, wait=False, output_file=None): import traceback import codecs ok = True errors = [] files = self._get_files_in_dir(dir, "txt") len_files = len(files) print "Testing AME on " + str(len_files) + " files:" print "\tInitializing AME......", sys.stdout.flush() try: ame = ArticleMetaExtractor() print "\t[OK]" except: print "\t[FAILED]" sys.stderr.write(str(sys.exc_info())) ok = False if ok: c = 0 for f in files: c += 1 print "\t" + str(c) + "/" + str(len_files) + "\t" + f + ".....", sys.stdout.flush() try: #fileObj = codecs.open(f, "r", "utf-8") #document = str(fileObj.read()) document = open(f, 'r').read() rrsf = RRSFile() _rel = RRSRelationshipFileUrl() _rel.set_entity(RRSUrl(link=f)) rrsf.set('url', _rel) fname = f.split('/') fname.reverse() rrsf.set('filename', fname[0]) rrsf.set('type', 'txt') publ = ame.extract_data(document, module="publication_text_data", files=[rrsf]) output = StringIO.StringIO() converter = Model2XMLConverter(stream=output) converter.convert(publ) #exit() if remove: os.system("rm " + f) if output_file != None: of = open(output_file, 'w') of.write(output.getvalue()) of.flush() of.close() print "\t[OK]" except: print "\t[FAILED]" print "\t###############################################################################################" sys.stdout.flush() if tb: traceback.print_tb(sys.exc_info()[2]) elif copy: os.system("cp " + f + " " + "/media/Data/RRS/files/txt/ame_errors/" + str(c) + ".txt") print sys.exc_info()[0], sys.exc_info()[1] sys.stderr.flush() print "\t###############################################################################################" print ok = False errors.append(f) if wait: raw_input("...continue?") if ok: print 'exiting test, everything is OK...' else: print 'exiting test, some error occured...' print '\nFiles with error:' print errors