def main(self):
        """Find the deliverable page(s), collect deliverables and emit the project.

        When ``self.opt['page']`` is set, ``self.opt_url`` itself is used as the
        only page.  Otherwise the pages come from
        ``self.pagesearch.get_deliverable_page()``.

        Returns:
            * the page-search result unchanged when its first element is ``-1``
              (only when ``self.opt['page']`` is not set);
            * ``(1, self.links)`` when ``self.opt['lookup_page']`` is set
              (only when ``self.opt['page']`` is not set);
            * the populated ``RRSProject`` when the record handler returned a
              list - its XML form is stored through ``self._storeToFile`` when
              ``self.opt['storefile']`` is set, and printed otherwise;
            * whatever ``self.recordhandler.get_deliverables()`` returned when
              that is not a list.
        """
        # Searching deliverable page
        if self.opt['page']:
            self.links = [self.opt_url]
        else:
            self.links = self.pagesearch.get_deliverable_page()

            # A first element of -1 is handed straight back to the caller
            # (presumably the page search's error marker - confirm).
            if self.links[0] == -1:
                return self.links

            if self.opt['lookup_page']:
                return (1, self.links)

            if self.opt['verbose']:
                separator = "*" * 80
                print(separator)
                # Two spaces after the colon: the same text the former
                # two-item print statement produced.
                print("Deliverable page:  " + " ".join(self.links))
                print(separator)

        pr = RRSProject()

        # Project - Url relationship
        if not self.opt['page']:
            pr_url = RRSUrl(link=self.opt_url)
            pr_url_rel = RRSRelationshipProjectUrl()
            pr_url_rel.set_entity(pr_url)
            pr['url'] = pr_url_rel

        self.recordhandler.process_pages(self.links)
        records = self.recordhandler.get_deliverables()

        # Anything that is not a list is passed back to the caller as it is.
        if not isinstance(records, list):
            return records

        # Create relationship Project - Publication for every record.
        # NOTE(review): every iteration assigns the same key; presumably
        # RRSProject accumulates relationships on assignment - confirm.
        for record in records:
            rel = RRSRelationshipPublicationProject()
            rel.set_entity(record)
            pr['publication'] = rel

        # Create XML from RRSProject
        output = StringIO.StringIO()
        try:
            converter = Model2XMLConverter(stream=output)
            converter.convert(pr)
            out = output.getvalue()
        finally:
            output.close()

        # Either store the XML into a file or print it; the RRSProject object
        # is returned in both cases.
        if self.opt['storefile']:
            status = self._storeToFile(self.opt_url, out)
            # A first element other than 1 is treated as a failed store and
            # the second element is printed as its message.
            if status[0] != 1:
                print(status[1])
        else:
            print(out.encode('UTF-8'))
        return pr
Example #2
0
    def _deliv_in_link(self,text,links,entry = False):
        
        ##print text
        ##print links
        #print "*"*40
        
        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)",re.I)

        _link = False
        
        for l in links:
            if pattern.search(l):
                  if _link == False:
                     _link =l
                  else:
                     return ['-3','Probably more records in one entry']



        #loop through text in entry looking for title and description
        for t in text:
           if _title == False:
                if len(t)>10 :
                     _title = t
           #set the longest string as description of deliverable
           if len(_description)<len(t):
                _description = t
             

        if _title == _description:
            _description = ""

        #if chosen title is not valid try to find better in parent entry
        if _title and not self._check_title(_title) and entry != False:
            _title = self._repair_title(entry)        
       
        
        #create object
        if _link:
            pub = RRSPublication(title=_title,abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ

            self.__debug("*"*40)
            self.__debug("Title: "+_title)
            self.__debug("Description: "+_description)
            
            self.__debug("Link: "+_link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
            
            return pub
        else:
            #this entry is not probably deliverable
            return False
Example #3
0
 def _more_entry_in_record(self, entry):
     for ch in entry.iter('chunk'):
         if ch.text != None and ch.attrib.get("link") != None:
             if self.agent.is_wanted_mime(ch.attrib.get("link")):
                 _pub = RRSPublication(title=ch.text)
                 typ = RRSPublication_type(type='techreport')
                 _pub['type'] = typ
                 _l = RRSUrl(link=ch.attrib.get("link"))
                 _rel = RRSRelationshipPublicationUrl()
                 _rel.set_entity(_l)
                 _pub['url'] = _rel
                 self._entriesFoundInLinks.append(_pub)
Example #4
0
    def _deliv_in_text(self, text, links):

        #print text
        #print links
        #print "*"*40
        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(.[0-9][0-9]*)?)",
                             re.I)

        #loop through text in entry looking for title and description
        for t in text:
            if _title == False:
                if pattern.search(t):
                    _title = t

            #set the longest string as description of deliverable
            if len(_description) < len(t):
                _description = t

        if _title == _description:
            _description = ""

        _link = False

        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        #if there was already found link
                        if _link[:s.rfind(_link, '.')] == l[:s.rfind(l, '.')]:
                            break
                        else:
                            return ['-3', 'Probably more records in one entry']

        #create object
        if _title:
            #print "TITLE:"+_title
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)

            if _link:
                #print "LINK:"+_link
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel

            return pub
        else:
            #this entry is not probably deliverable
            return False
    def test(self, dir, tb=False, remove=False, copy=True, wait=False, output_file=None):
        """Run ArticleMetaExtractor over every txt file in *dir* and report failures.

        Progress and a final summary are printed to standard output; nothing
        is returned.

        Args:
            dir: directory handed to ``self._get_files_in_dir(dir, "txt")``.
            tb: print the traceback of a failing file.
            remove: delete each file after it was processed successfully.
            copy: when *tb* is not set, copy a failing file into
                ``/media/Data/RRS/files/txt/ame_errors/`` as ``<index>.txt``.
            wait: prompt for input after every file.
            output_file: path that receives the converted XML.  It is reopened
                in ``'w'`` mode for every file, so it ends up holding only the
                last converted document.
                NOTE(review): confirm this overwrite is intended.
        """
        import shutil
        import traceback

        banner = "\t###############################################################################################"

        ok = True
        errors = []

        files = self._get_files_in_dir(dir, "txt")
        len_files = len(files)

        print("Testing AME on " + str(len_files) + " files:")

        # The status is appended to this line later.  The trailing space is
        # the separator the former "print ...," statement produced.
        sys.stdout.write("\tInitializing AME...... ")
        sys.stdout.flush()
        try:
            ame = ArticleMetaExtractor()
            print("\t[OK]")
        except Exception:
            print("\t[FAILED]")
            sys.stderr.write(str(sys.exc_info()))
            ok = False

        if ok:
            for c, f in enumerate(files, 1):
                sys.stdout.write("\t" + str(c) + "/" + str(len_files) + "\t" + f + "..... ")
                sys.stdout.flush()
                try:
                    with open(f, 'r') as source:
                        document = source.read()
                    rrsf = RRSFile()
                    _rel = RRSRelationshipFileUrl()
                    _rel.set_entity(RRSUrl(link=f))
                    rrsf.set('url', _rel)
                    # last '/'-separated component of the path
                    rrsf.set('filename', f.split('/')[-1])
                    rrsf.set('type', 'txt')
                    publ = ame.extract_data(document, module="publication_text_data",
                                            files=[rrsf])
                    output = StringIO.StringIO()
                    converter = Model2XMLConverter(stream=output)
                    converter.convert(publ)
                    if remove:
                        # Best effort, like the shell "rm" this replaces: a
                        # failed removal does not mark the file as failed.
                        try:
                            os.remove(f)
                        except OSError:
                            sys.stderr.write("could not remove " + f + "\n")
                    if output_file is not None:
                        with open(output_file, 'w') as of:
                            of.write(output.getvalue())
                    print("\t[OK]")
                except Exception:
                    # Capture first: the nested try below may replace the
                    # "current" exception information.
                    exc_type, exc_value, exc_tb = sys.exc_info()
                    print("\t[FAILED]")
                    print(banner)
                    sys.stdout.flush()
                    if tb:
                        traceback.print_tb(exc_tb)
                    elif copy:
                        # Best effort, like the shell "cp" this replaces.
                        try:
                            shutil.copy(f, "/media/Data/RRS/files/txt/ame_errors/" + str(c) + ".txt")
                        except EnvironmentError:
                            sys.stderr.write("could not copy " + f + "\n")

                    print("%s %s" % (exc_type, exc_value))
                    sys.stderr.flush()
                    print(banner)
                    print("")
                    ok = False
                    errors.append(f)
                if wait:
                    raw_input("...continue?")
        if ok:
            print('exiting test, everything is OK...')
        else:
            print('exiting test, some error occured...')
            print('\nFiles with error:')
            print(errors)