Example 1
def parserMonthly():
    unparsed_monthly = MonthlyLink.objects.filter(enable=True)
    #unparsed_monthly = MonthlyLink.objects.all()
    for monthly in unparsed_monthly:
        r = urlparse.urlsplit(monthly.link)
        HTML = ''
        if monthly.raw_desc:
            HTML = monthly.raw_desc
        else:
            HTML = getHTML(monthly.link)
            monthly.raw_desc = HTML
            #monthly.save()

        if HTML:
            mcp = MonthlyCollectionParser()
            mcp.feed(HTML)

            for link in mcp.links:
                link_url, link_title = link
                link_url = urlparse.urlunsplit(('http', r.netloc, link_url, '', ''))
                daily = DailyLink(monthly_link=monthly, link=link_url, label=link_title)
                try:
                    daily.save()
                except IntegrityError:
                    # a DailyLink with the same URL already exists; skip it
                    pass
        # mark this monthly page as parsed; this save also persists raw_desc
        monthly.enable = False
        monthly.save()
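
The function above leans on project-specific names (MonthlyLink, DailyLink, getHTML, MonthlyCollectionParser) that are defined elsewhere. Below is a minimal sketch of what they might look like, assuming Python 2 and an older Django release consistent with the urlparse usage; the class names are from the example, but every field and helper body is inferred from usage and is not the original project's code.

# Hedged sketch of the assumed dependencies (Python 2 / Django 1.x).
# Field definitions and helper bodies are inferred from usage above.
import urllib2
from HTMLParser import HTMLParser

from django.db import models


class MonthlyLink(models.Model):
    link = models.URLField()                     # URL of the monthly index page
    raw_desc = models.TextField(blank=True)      # cached HTML of that page
    enable = models.BooleanField(default=True)   # True until the page has been parsed


class DailyLink(models.Model):
    monthly_link = models.ForeignKey(MonthlyLink)
    link = models.URLField(unique=True)          # duplicate saves raise IntegrityError
    label = models.CharField(max_length=255)


def getHTML(url):
    # assumed helper: return the page body, or '' if the request fails
    try:
        return urllib2.urlopen(url).read()
    except urllib2.URLError:
        return ''


class MonthlyCollectionParser(HTMLParser):
    # assumed helper: collect (href, anchor text) pairs for every <a> tag in self.links
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
        self._href = None
        self._text = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self._href = dict(attrs).get('href')
            self._text = []

    def handle_data(self, data):
        if self._href is not None:
            self._text.append(data)

    def handle_endtag(self, tag):
        if tag == 'a' and self._href is not None:
            self.links.append((self._href, ''.join(self._text).strip()))
            self._href = None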
Example 2
def parseMonth(month_url):
    #logger.info("Parse Month:" + str(month_url.link))
    url = urlparse.urlsplit(month_url.link)
    servername = url[0] + "://" + url[1]
    try:
        page = urllib2.urlopen(month_url.link)
    except URLError:
        raise
    soup = BeautifulSoup(page, fromEncoding='gbk')
    #content = soup.prettify()
    links = soup.findAll('a', {'href': True, 'target': True}, True)

    count = 0
    parsed_count = 0
    # match titles like "★㊣最新の日本…♂…♀" or "★㊣最新の亚洲…♂…♀"
    reobj = re.compile(u"^★㊣最新の(?:日本|亚洲).*♂.*♀$")

    for link in links:
        content = link.getText()
        match = reobj.search(content)
        if match:
            count = count + 1
            logger.info(content)
            # store the daily link
            linkstr = servername + link.get('href', '')
            dailyLink = DailyLink(link=linkstr,
                                  monthly_link=month_url,
                                  label=content)
            try:
                dailyLink.save()
                parsed_count = parsed_count + 1
            except IntegrityError:
                logger.info("URL already existed:...." + linkstr)
                pass
            if count > 3:
                # stop after a handful of matches per run (count > 3 allows at most four)
                break
        else:
            logger.info(content + " not match!")
            continue
    return parsed_count
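
For context, here is a rough sketch of the imports and call pattern this function assumes: Python 2's urllib2/urlparse, BeautifulSoup 3 (which takes the fromEncoding keyword), Django's IntegrityError, and the same MonthlyLink/DailyLink models as above. The links.models import path and the logger setup are hypothetical placeholders, not the original project's layout.

# A minimal usage sketch, assuming Python 2, BeautifulSoup 3, and the models
# sketched earlier; 'links.models' is a hypothetical placeholder path.
import logging
import re
import urllib2
import urlparse
from urllib2 import URLError

from BeautifulSoup import BeautifulSoup   # BeautifulSoup 3 exposes fromEncoding
from django.db import IntegrityError

from links.models import MonthlyLink, DailyLink   # hypothetical app path

logger = logging.getLogger(__name__)

# Walk every still-enabled monthly page and log how many daily links were stored.
for month_url in MonthlyLink.objects.filter(enable=True):
    stored = parseMonth(month_url)
    logger.info("%s: stored %d daily links", month_url.link, stored)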