Ejemplo n.º 1
0
 def get_tag_by_link(link):
     """Return the tag registered for the first path segment of *link*.

     The lookup key is the result of ``helpers.get_url_from_link(link)``
     with surrounding slashes stripped, cut at the first ``/``.

     ``Retriever.TAGS_MAPPING`` is used as a cache: while it is empty, the
     site root page (``helpers.get_site_root_link(link)``) is downloaded
     and every ``(link, tag)`` pair returned by ``Retriever.get_dirs`` is
     stored in it, keyed by the slash-stripped directory link.

     Returns the mapped tag, or ``None`` (after logging a failure through
     ``logger.write_fail``) when the key is not in the mapping.
     """
     url = helpers.get_url_from_link(link).strip('/').split('/')[0]
     if not Retriever.TAGS_MAPPING:
         dm = WaitingDM()
         print('downloading main page of LibRu for retrieving tags...')
         html = dm.download(helpers.get_site_root_link(link))
         soup = get_soup(html)
         # Use a dedicated loop name: reusing ``link`` here would overwrite
         # the argument and make the failure log below report the last
         # directory link instead of the link that was actually requested.
         for dir_link, tag in Retriever.get_dirs(soup):
             Retriever.TAGS_MAPPING[dir_link.strip('/')] = tag
     if url not in Retriever.TAGS_MAPPING:
         # TODO: find another way to retrieve the tag in this case
         logger.write_fail("LibRu parser: can't find tag in main page",
                           link=link, url=url)
         return None
     return Retriever.TAGS_MAPPING[url]
Ejemplo n.º 2
0
def refresh_libru():
    print 'scanning what should be refreshed...'
    refreshes = Refresh.objects.all()
    links = []
    ref_dm = WaitingDM()
    for refresh in refreshes:
        headers, page = ref_dm.download_headers(refresh.link)
        print refresh.link,
        field = LAST_MODIFIED
        if field in headers:
            new_date = headers[field]
            if refresh.check_refreshable(new_date):
                links.append( (refresh.link, new_date) )
                print '--> REFRESH',
        else:
            print ' no', field, 'in headers!',
        print 
    if not links:
        print 'nothing to refresh. Everything is up-to-date'

    parser = LibRu()
    parser_name = parser.get_filename()+'_refresh'
    storage = TaskStorageDB( parser_name, [],Q(parser_name=parser_name, good=True)  )

    for link, date in links:
        refresh = Refresh.objects.get(link=link)
        refresh.delete()
#TODO don't forget about make last-modified links actual
        # now actual is when it'll be updated by again scanning
#        refresh.last_modified = date
        task = DirPage(link)
        print 'adding task for refresh:', task
        storage.accept_new_tasks( [task] )

    dm = DM_LibRu_AddRefreshLinks()
    tm = TM_LibRuRefresh(storage, dm)
    tm.run()
    print 'refreshing is finished'
Ejemplo n.º 3
0
 def read_data(self, page):
     """Read *page*, treating book files differently from other pages.

     When ``self.bookfile_condition(page.url)`` is true, the result of
     ``page.read`` called with
     ``analyser.settings.LIBRU_DEFINE_LANG_BY_BYTES`` is returned.
     Every other page is delegated to ``WaitingDM.read_data``.
     """
     if not self.bookfile_condition(page.url):
         return WaitingDM.read_data(self, page)
     # NOTE(review): the setting name suggests a byte count used for
     # language detection -- confirm against analyser.settings.
     limit = analyser.settings.LIBRU_DEFINE_LANG_BY_BYTES
     return page.read(limit)