def GCDScraper(ComicName, ComicYear, Total, ComicID, quickmatch=None):
    """Search comics.org (GCD) for a series matching name / year / issue count.

    Retries itself with common name variations (number words -> digits,
    leading 'The', ':' and '-' stripped, 'and' -> '&') when no match is found.

    Returns:
        'Match' / 'No Match' when quickmatch == "yes" (or 'No Match' when no
        match and quickmatch is falsy); otherwise hands the matched series off
        to GCDdetails() and returns its result.
    """
    NOWyr = datetime.date.today().year
    if datetime.date.today().month == 12:
        # December books often carry next year's cover date; widen the window.
        NOWyr = NOWyr + 1
        logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
    # spaces break the GET query string
    comicnm = re.sub(' ', '+', ComicName)
    searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(ComicYear) + '-01-01&end_date=' + str(NOWyr) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'
    response = urllib2.urlopen(searchurl)
    soup = BeautifulSoup(response)
    # results alternate between even/odd styled rows; walk them in page order
    rows_even = soup.findAll("tr", {"class": "listing_even"})
    rows_odd = soup.findAll("tr", {"class": "listing_odd"})
    cnt = len(rows_even) + len(rows_odd)

    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while n < cnt:
        if n % 2 == 0:
            n_even += 1
            resultp = rows_even[n_even]
        else:
            n_odd += 1
            resultp = rows_odd[n_odd]
        rtp = resultp('a')[1]
        resultName.append(helpers.cleanName(rtp.findNext(text=True)))
        fip = resultp('a', href=True)[1]
        resultID.append(fip['href'])
        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ', '')
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # the cell reads like '12 issues (...)' - keep only the leading count
        resiss = int(resultIssues[n].find('issue'))
        resultIssues[n] = resultIssues[n][:resiss].replace(' ', '')

        # compare punctuation/space-stripped, lowercased names; also accept a
        # match with the first 3 chars dropped (article/prefix slop)
        CleanComicName = re.sub(r'[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', comicnm)
        CleanComicName = re.sub(' ', '', CleanComicName).lower()
        CleanResultName = re.sub(r'[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', resultName[n])
        CleanResultName = re.sub(' ', '', CleanResultName).lower()
        if CleanResultName == CleanComicName or CleanResultName[3:] == CleanComicName:
            # series year may be off by one due to Dec/Jan publication overlap,
            # so accept a 1-year grace
            if resultYear[n] == ComicYear or resultYear[n] == str(int(ComicYear) + 1):
                # GCD and CV issue counts can disagree by one in either
                # direction (ie. witchblade: GCD 159, CV 161) - allow +/-1
                if int(resultIssues[n]) == int(Total) or int(resultIssues[n]) == int(Total) + 1 or (int(resultIssues[n]) + 1) == int(Total):
                    if int(resultIssues[n]) == int(Total) + 1:
                        issvariation = "cv"
                    elif int(resultIssues[n]) + 1 == int(Total):
                        issvariation = "gcd"
                    else:
                        issvariation = "no"
                    TotalIssues = resultIssues[n]
                    resultURL = str(resultID[n])
                    rptxt = resultp('td')[6]
                    resultPublished = rptxt.findNext(text=True)
                    break
        n += 1

    # comicvine and gcd occasionally disagree on exact titles
    # (ie. 'The Flash-2011' on comicvine is 'Flash-2011' on gcd), so retry
    # with spelling/punctuation variations before giving up.
    basnumbs = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
                'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12}
    if resultURL is None:
        # number spelled as a word -> numeric
        for numbs in basnumbs:
            if numbs in ComicName.lower():
                numconv = basnumbs[numbs]
                ComicNm = re.sub(str(numbs), str(numconv), ComicName.lower())
                return GCDScraper(ComicNm, ComicYear, Total, ComicID)
        if ComicName.lower().startswith('the '):
            ComicName = ComicName[4:]
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if ':' in ComicName:
            ComicName = re.sub(':', '', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if '-' in ComicName:
            ComicName = re.sub('-', ' ', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if 'and' in ComicName.lower():
            ComicName = ComicName.replace('and', '&')
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if not quickmatch:
            return 'No Match'
    if quickmatch == "yes":
        if resultURL is None:
            return 'No Match'
        else:
            return 'Match'
    return GCDdetails(comseries=None, resultURL=resultURL, vari_loop=0, ComicID=ComicID, TotalIssues=TotalIssues, issvariation=issvariation, resultPublished=resultPublished)
def ComChk(ComicName, ComicYear, ComicPublisher, Total, ComicID):
    """Run up to three comics.org searches with name variations and collect
    unique candidate series.

    Returns:
        ({'comchkchoice': [candidate dicts]}, total result count across runs)
    """
    comchkchoice = []
    comchoice = {}
    NOWyr = datetime.date.today().year
    if datetime.date.today().month == 12:
        # December books often carry next year's cover date; widen the window.
        NOWyr = NOWyr + 1
        logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
    comicnm = ComicName
    comicyr = ComicYear
    comicid = ComicID
    comicpub = ComicPublisher
    comicrun = []
    # only constrain by publisher for the big ones; for anything else the
    # name mismatch (ie. 'DC Comics' vs 'DC') would kill the search, so use
    # just the first word of the publisher name
    pubbiggies = ['DC', 'Marvel', 'Image', 'IDW']
    uhuh = "no"
    for pb in pubbiggies:
        if pb in comicpub:
            uhuh = "yes"
            conv_pub = comicpub.split()[0]
    # 1st run: name as-is
    comicrun.append(comicnm)
    cruncnt = 0
    # 2nd run: drop the last word for a broader search (year filter still applies)
    if len(str(comicnm).split()) > 2:
        comicrun.append(' '.join(comicnm.split(' ')[:-1]))
        cruncnt += 1
    # 3rd run: strip extra punctuation to broaden the scope
    if re.sub(r'[\.\,\:]', '', comicnm) != comicnm:
        comicrun.append(re.sub(r'[\.\,\:]', '', comicnm))
        cruncnt += 1

    totalcount = 0
    cr = 0
    while cr <= cruncnt:
        # spaces break the GET query string
        comicnm = re.sub(' ', '+', comicrun[cr])
        if uhuh == "yes":
            publink = "&pub_name=" + str(conv_pub)
        else:
            publink = "&pub_name="
        # NOTE: the original URL contained the mojibake '¬es=' (a corrupted
        # '&notes=' where '&not' was rendered as the not-sign); restored here.
        searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&keywords=&order1=series&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31' + '&title=&feature=&job_number=&pages=&script=&pencils=&inks=&colors=&letters=&story_editing=&genre=&characters=&synopsis=&reprint_notes=&story_reprinted=None&notes=' + str(publink) + '&pub_notes=&brand=&brand_notes=&indicia_publisher=&is_surrogate=None&ind_pub_notes=&series=' + str(comicnm) + '&series_year_began=&series_notes=&tracking_notes=&issue_count=&is_comics=None&format=&color=&dimensions=&paper_stock=&binding=&publishing_format=&issues=&volume=&issue_title=&variant_name=&issue_date=&indicia_frequency=&price=&issue_pages=&issue_editing=&isbn=&barcode=&issue_notes=&issue_reprinted=None&is_indexed=None'
        response = urllib2.urlopen(searchurl)
        soup = BeautifulSoup(response)
        # results alternate between even/odd styled rows; walk them in page order
        rows_even = soup.findAll("tr", {"class": "listing_even"})
        rows_odd = soup.findAll("tr", {"class": "listing_odd"})
        cnt = len(rows_even) + len(rows_odd)

        resultName = []
        resultID = []
        resultYear = []
        resultIssues = []
        resultPublisher = []
        n_odd = -1
        n_even = -1
        n = 0
        while n < cnt:
            if n % 2 == 0:
                n_even += 1
                resultp = rows_even[n_even]
            else:
                n_odd += 1
                resultp = rows_odd[n_odd]
            rtp = resultp('a')[1]
            resultName.append(helpers.cleanName(rtp.findNext(text=True)))
            pub = resultp('a')[0]
            resultPublisher.append(pub.findNext(text=True))
            fip = resultp('a', href=True)[1]
            resultID.append(fip['href'])
            subtxt3 = resultp('td')[3]
            resultYear.append(subtxt3.findNext(text=True))
            resultYear[n] = resultYear[n].replace(' ', '')
            subtxt4 = resultp('td')[4]
            resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
            # the cell reads like '12 issues (...)' - keep only the leading count
            resiss = int(resultIssues[n].find('issue'))
            resultIssues[n] = resultIssues[n][:resiss].replace(' ', '')
            # de-dupe across the search runs; compare the same split-out GCDID
            # that gets stored (the original compared the full href and never
            # matched, letting duplicates through)
            if not any(d.get('GCDID', None) == str(resultID[n]).split('/')[2] for d in comchkchoice):
                comchkchoice.append({
                    "ComicID": str(comicid),
                    "ComicName": str(resultName[n]),
                    "GCDID": str(resultID[n]).split('/')[2],
                    "ComicYear": str(resultYear[n]),
                    "ComicPublisher": str(resultPublisher[n]),
                    "ComicURL": "http://www.comics.org" + str(resultID[n]),
                    "ComicIssues": str(resultIssues[n])
                })
            n += 1
        cr += 1
        totalcount = totalcount + cnt

    comchoice['comchkchoice'] = comchkchoice
    return comchoice, totalcount
def GCDScraper(ComicName, ComicYear, Total, ComicID):
    """Search comics.org for a series by exact (lowercased) name, then scrape
    the matched series' details page for per-issue numbers and dates.

    Returns:
        dict with 'ComicIssue', 'ComicDate', 'gcdchoice' (list of per-issue
        dicts), 'gcdvariation' and 'totalissues', or 'No Match'.
    """
    NOWyr = datetime.date.today().year
    # spaces break the GET query string
    comicnm = re.sub(' ', '%20', ComicName)
    searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(ComicYear) + '-01-01&end_date=' + str(NOWyr) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'
    response = urllib2.urlopen(searchurl)
    soup = BeautifulSoup(response)
    # results alternate between even/odd styled rows; walk them in page order
    rows_even = soup.findAll("tr", {"class": "listing_even"})
    rows_odd = soup.findAll("tr", {"class": "listing_odd"})
    cnt = len(rows_even) + len(rows_odd)

    global resultPublished
    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while n < cnt:
        if n % 2 == 0:
            n_even += 1
            resultp = rows_even[n_even]
        else:
            n_odd += 1
            resultp = rows_odd[n_odd]
        rtp = resultp('a')[1]
        resultName.append(helpers.cleanName(rtp.findNext(text=True)))
        fip = resultp('a', href=True)[1]
        resultID.append(fip['href'])
        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ', '')
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # the cell reads like '12 issues (...)' - keep only the leading count
        resiss = int(resultIssues[n].find('issue'))
        resultIssues[n] = resultIssues[n][:resiss].replace(' ', '')

        if resultName[n].lower() == str(ComicName).lower():
            # series year may be off by one due to Dec/Jan publication
            # overlap, so accept a 1-year grace
            if resultYear[n] == ComicYear or resultYear[n] == str(int(ComicYear) + 1):
                # GCD and CV issue counts can disagree by one in either
                # direction (ie. witchblade: GCD 159, CV 161) - allow +/-1
                if resultIssues[n] == Total or resultIssues[n] == str(int(Total) + 1) or str(int(resultIssues[n]) + 1) == Total:
                    if resultIssues[n] == str(int(Total) + 1):
                        issvariation = "cv"
                    elif str(int(resultIssues[n]) + 1) == Total:
                        issvariation = "gcd"
                    else:
                        issvariation = "no"
                    TotalIssues = resultIssues[n]
                    resultURL = str(resultID[n])
                    rptxt = resultp('td')[6]
                    resultPublished = rptxt.findNext(text=True)
                    break
        n += 1

    # comicvine and gcd occasionally disagree on exact titles
    # (ie. 'The Flash-2011' on comicvine is 'Flash-2011' on gcd)
    if resultURL is None:
        if ComicName.startswith('The '):
            ComicName = ComicName[4:]
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if ':' in ComicName:
            ComicName = re.sub(':', '', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if '-' in ComicName:
            ComicName = re.sub('-', ' ', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if 'and' in ComicName.lower():
            ComicName = ComicName.replace('and', '&')
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        return 'No Match'

    gcdinfo = {}
    gcdchoice = []
    detailurl = 'http://www.comics.org' + str(resultURL) + 'details/'
    resp = urllib2.urlopen(detailurl)
    soup = BeautifulSoup(resp)
    # newer comics list a complete on-sale date; older ones only a pub. date
    # ('type' renamed to avoid shadowing the builtin)
    onsale_marker = soup.find(text=' On-sale date ')
    if onsale_marker:
        datetype = "on-sale"
    else:
        datetype = "pub"
    rows_false = soup.findAll("tr", {"class": "row_even_False"})
    rows_true = soup.findAll("tr", {"class": "row_even_True"})
    cnt = len(rows_false) + len(rows_true)
    # cnt may over-count due to alternate prints/covers of the same issue

    n_odd = -1
    n_even = -1
    n = 0
    PI = "1.00"  # previously-seen issue number, to spot alternate covers
    altcount = 0
    while n < cnt:
        if n % 2 == 0:
            n_odd += 1
            parsed = rows_false[n_odd]
        else:
            n_even += 1
            parsed = rows_true[n_even]
        subtxt3 = parsed.find("a")
        ParseIssue = subtxt3.findNext(text=True)
        if ',' in ParseIssue:
            ParseIssue = re.sub(r"\,", "", ParseIssue)
        # a space in the issue text usually means an alternate-cover label;
        # only the FIRST record of an issue is kept, further covers ignored
        isslen = ParseIssue.find(' ')
        isschk = ParseIssue[:isslen]
        # normalize to a '.00'-style decimal for comparison
        if '.' in isschk:
            isschk_find = isschk.find('.')
            isschk_decval = isschk[isschk_find + 1:]
        else:
            isschk_decval = ".00"
        if isslen > 0:
            isschk = ParseIssue[:isslen]
            isschk2 = str(isschk) + isschk_decval
            ParseIssue = str(isschk2)
            if str(PI) == str(isschk2):
                if altcount == 0:
                    # first occurrence of this issue number
                    print ("First occurrence detected - " + str(isschk))
                    ParseIssue = str(isschk2)
                    PI = str(isschk2)
                    altcount = 1
                else:
                    # repeat of the same number: an alternate cover - skip it
                    ParseIssue = "this is wrong"
                    altcount += 1
            else:
                altcount = 1
                ParseIssue = str(isschk) + isschk_decval
        else:
            # no alternate cover marker for this issue
            ParseIssue = ParseIssue + isschk_decval
            altcount = 1
        if altcount == 1:
            gcdinfo['ComicIssue'] = ParseIssue
            subtxt1 = parsed('td')[2]
            ParseDate = subtxt1.findNext(text=True)
            if len(ParseDate) < 7:
                # fall back to the first cell when the date cell is too short
                subtxt1 = parsed.find("td")
                ParseDate = subtxt1.findNext(text=True)
                if ParseDate == ' ':
                    ParseDate = "0000-00-00"
            ParseDate = ParseDate.replace(' ', '')
            gcdinfo['ComicDate'] = ParseDate
            gcdchoice.append({'GCDid': ComicID, 'GCDIssue': gcdinfo['ComicIssue'], 'GCDDate': gcdinfo['ComicDate']})
            gcdinfo['gcdchoice'] = gcdchoice
            PI = ParseIssue
        n += 1
    gcdinfo['gcdvariation'] = issvariation
    gcdinfo['totalissues'] = TotalIssues
    return gcdinfo
def annualCheck(gcomicid, comicid, comicname, comicyear):
    """Search GCD for a '<comicname> annual' series in the given series year.

    Only usable once a GCD match already exists. Currently returns None;
    the matched values (TotalIssues/resultURL/resultPublished) are computed
    but not yet propagated.
    """
    print ("GcomicID: " + str(gcomicid))
    print ("comicID: " + str(comicid))
    print ("comicname: " + comicname)
    print ("comicyear: " + str(comicyear))
    comicnm = comicname.encode('utf-8').strip()
    # escape '+' then use '+' for spaces in the GET query string
    comicnm_1 = re.sub(r'\+', '%2B', comicnm + " annual")
    comicnm = re.sub(' ', '+', comicnm_1)
    searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(comicyear) + '-01-01&end_date=' + str(comicyear) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'
    response = urllib2.urlopen(searchurl)
    soup = BeautifulSoup(response)
    # results alternate between even/odd styled rows; walk them in page order
    rows_even = soup.findAll("tr", {"class": "listing_even"})
    rows_odd = soup.findAll("tr", {"class": "listing_odd"})
    cnt = len(rows_even) + len(rows_odd)
    print (str(cnt) + " results")

    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while n < cnt:
        if n % 2 == 0:
            n_even += 1
            resultp = rows_even[n_even]
        else:
            n_odd += 1
            resultp = rows_odd[n_odd]
        rtp = resultp('a')[1]
        # extract the link text FIRST, then strip 'Annual' from it (the
        # original ran re.sub over the Tag object itself, which raises)
        rtp1 = re.sub('Annual', '', rtp.findNext(text=True))
        resultName.append(helpers.cleanName(rtp1))
        print ("Comic Name: " + str(resultName[n]))
        fip = resultp('a', href=True)[1]
        resultID.append(fip['href'])
        print ("ID: " + str(resultID[n]))
        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ', '')
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # the cell reads like '12 issues (...)' - keep only the leading count
        resiss = int(resultIssues[n].find('issue'))
        resultIssues[n] = resultIssues[n][:resiss].replace(' ', '')
        print ("Year: " + str(resultYear[n]))
        print ("Issues: " + str(resultIssues[n]))

        # compare punctuation/space-stripped, lowercased names; also accept a
        # match with the first 3 chars dropped (article/prefix slop)
        CleanComicName = re.sub(r'[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', comicnm)
        CleanComicName = re.sub(' ', '', CleanComicName).lower()
        CleanResultName = re.sub(r'[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', resultName[n])
        CleanResultName = re.sub(' ', '', CleanResultName).lower()
        print ("CleanComicName: " + str(CleanComicName))
        print ("CleanResultName: " + str(CleanResultName))
        if CleanResultName == CleanComicName or CleanResultName[3:] == CleanComicName:
            # 1-year grace for Dec/Jan publication overlap.
            # (original referenced the undefined name 'ComicYear'; the
            # parameter here is 'comicyear')
            if resultYear[n] == comicyear or resultYear[n] == str(int(comicyear) + 1):
                print ("n:" + str(n) + "...matched by year to Mylar!")
                print ("Year: " + str(resultYear[n]))
                TotalIssues = resultIssues[n]
                resultURL = str(resultID[n])
                rptxt = resultp('td')[6]
                resultPublished = rptxt.findNext(text=True)
                break
        n += 1
    return
def ComChk(ComicName, ComicYear, ComicPublisher, Total, ComicID):
    """Run up to four comics.org searches with name variations and collect
    unique candidate series (utf-8-encoding variant).

    Returns:
        ({'comchkchoice': [candidate dicts]}, total result count across runs)
    """
    comchkchoice = []
    comchoice = {}
    NOWyr = datetime.date.today().year
    if datetime.date.today().month == 12:
        # December books often carry next year's cover date; widen the window.
        NOWyr = NOWyr + 1
        logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
    comicnm = ComicName.encode('utf-8').strip()
    comicyr = ComicYear
    comicid = ComicID
    comicpub = ComicPublisher.encode('utf-8').strip()
    comicrun = []
    # only constrain by publisher for the big ones; for anything else the
    # name mismatch (ie. 'DC Comics' vs 'DC') would kill the search, so use
    # just the first word of the publisher name
    pubbiggies = ['DC', 'Marvel', 'Image', 'IDW']
    uhuh = "no"
    for pb in pubbiggies:
        if pb in comicpub:
            uhuh = "yes"
            conv_pub = comicpub.split()[0]
    # 1st run: name as-is
    comicrun.append(comicnm)
    cruncnt = 0
    # 2nd run: drop the last word for a broader search (year filter still applies)
    if len(str(comicnm).split()) > 2:
        comicrun.append(' '.join(comicnm.split(' ')[:-1]))
        cruncnt += 1
    # 3rd run: strip extra punctuation to broaden the scope
    if re.sub(r'[\.\,\:]', '', comicnm) != comicnm:
        comicrun.append(re.sub(r'[\.\,\:]', '', comicnm))
        cruncnt += 1
    # 4th run: drop a leading 'the'
    if comicnm.lower().startswith('the'):
        comicrun.append(comicnm[4:].strip())
        cruncnt += 1

    totalcount = 0
    cr = 0
    while cr <= cruncnt:
        # spaces break the GET query string
        comicnm = re.sub(' ', '+', comicrun[cr])
        if uhuh == "yes":
            publink = "&pub_name=" + str(conv_pub)
        else:
            publink = "&pub_name="
        # NOTE: the original URL contained the mojibake '¬es=' (a corrupted
        # '&notes=' where '&not' was rendered as the not-sign); restored here.
        searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&keywords=&order1=series&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31' + '&title=&feature=&job_number=&pages=&script=&pencils=&inks=&colors=&letters=&story_editing=&genre=&characters=&synopsis=&reprint_notes=&story_reprinted=None&notes=' + str(publink) + '&pub_notes=&brand=&brand_notes=&indicia_publisher=&is_surrogate=None&ind_pub_notes=&series=' + str(comicnm) + '&series_year_began=&series_notes=&tracking_notes=&issue_count=&is_comics=None&format=&color=&dimensions=&paper_stock=&binding=&publishing_format=&issues=&volume=&issue_title=&variant_name=&issue_date=&indicia_frequency=&price=&issue_pages=&issue_editing=&isbn=&barcode=&issue_notes=&issue_reprinted=None&is_indexed=None'
        response = urllib2.urlopen(searchurl)
        soup = BeautifulSoup(response)
        # results alternate between even/odd styled rows; walk them in page order
        rows_even = soup.findAll("tr", {"class": "listing_even"})
        rows_odd = soup.findAll("tr", {"class": "listing_odd"})
        cnt = len(rows_even) + len(rows_odd)

        resultName = []
        resultID = []
        resultYear = []
        resultIssues = []
        resultPublisher = []
        n_odd = -1
        n_even = -1
        n = 0
        while n < cnt:
            if n % 2 == 0:
                n_even += 1
                resultp = rows_even[n_even]
            else:
                n_odd += 1
                resultp = rows_odd[n_odd]
            rtp = resultp('a')[1]
            rtpit = rtp.findNext(text=True)
            rtpthis = rtpit.encode('utf-8').strip()
            resultName.append(helpers.cleanName(rtpthis))
            pub = resultp('a')[0]
            pubit = pub.findNext(text=True)
            pubthis = pubit.encode('utf-8').strip()
            resultPublisher.append(pubthis)
            fip = resultp('a', href=True)[1]
            resultID.append(fip['href'])
            subtxt3 = resultp('td')[3]
            resultYear.append(subtxt3.findNext(text=True))
            resultYear[n] = resultYear[n].replace(' ', '')
            subtxt4 = resultp('td')[4]
            resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
            # the cell reads like '12 issues (...)' - keep only the leading count
            resiss = int(resultIssues[n].find('issue'))
            resultIssues[n] = resultIssues[n][:resiss].replace(' ', '')
            # de-dupe across the search runs; compare the same split-out GCDID
            # that gets stored (the original compared the full href and never
            # matched, letting duplicates through)
            if not any(d.get('GCDID', None) == str(resultID[n]).split('/')[2] for d in comchkchoice):
                comchkchoice.append({
                    "ComicID": str(comicid),
                    "ComicName": resultName[n],
                    "GCDID": str(resultID[n]).split('/')[2],
                    "ComicYear": str(resultYear[n]),
                    "ComicPublisher": resultPublisher[n],
                    "ComicURL": "http://www.comics.org" + str(resultID[n]),
                    "ComicIssues": str(resultIssues[n])
                })
            n += 1
        cr += 1
        totalcount = totalcount + cnt

    comchoice['comchkchoice'] = comchkchoice
    return comchoice, totalcount
def GCDScraper(ComicName, ComicYear, Total, ComicID, quickmatch=None):
    """Search comics.org (GCD) for a series matching name / year / issue count
    (utf-8-encoding variant).

    Retries itself with common name variations (number words -> digits,
    leading 'The', ':' and '-' stripped, 'and' -> '&') when no match is found.

    Returns:
        'Match' / 'No Match' when quickmatch == "yes" (or 'No Match' when no
        match and quickmatch is falsy); otherwise hands the matched series off
        to GCDdetails() and returns its result.
    """
    NOWyr = datetime.date.today().year
    if datetime.date.today().month == 12:
        # December books often carry next year's cover date; widen the window.
        NOWyr = NOWyr + 1
        logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
    comicnm = ComicName.encode('utf-8').strip()
    # escape '+' then use '+' for spaces in the GET query string
    comicnm_1 = re.sub(r'\+', '%2B', comicnm)
    comicnm = re.sub(' ', '+', comicnm_1)
    searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(ComicYear) + '-01-01&end_date=' + str(NOWyr) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'
    response = urllib2.urlopen(searchurl)
    soup = BeautifulSoup(response)
    # results alternate between even/odd styled rows; walk them in page order
    rows_even = soup.findAll("tr", {"class": "listing_even"})
    rows_odd = soup.findAll("tr", {"class": "listing_odd"})
    cnt = len(rows_even) + len(rows_odd)

    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while n < cnt:
        if n % 2 == 0:
            n_even += 1
            resultp = rows_even[n_even]
        else:
            n_odd += 1
            resultp = rows_odd[n_odd]
        rtp = resultp('a')[1]
        resultName.append(helpers.cleanName(rtp.findNext(text=True)))
        fip = resultp('a', href=True)[1]
        resultID.append(fip['href'])
        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ', '')
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # the cell reads like '12 issues (...)' - keep only the leading count
        resiss = int(resultIssues[n].find('issue'))
        resultIssues[n] = resultIssues[n][:resiss].replace(' ', '')

        # compare punctuation/space-stripped, lowercased names; also accept a
        # match with the first 3 chars dropped (article/prefix slop)
        CleanComicName = re.sub(r'[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', comicnm)
        CleanComicName = re.sub(' ', '', CleanComicName).lower()
        CleanResultName = re.sub(r'[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', resultName[n])
        CleanResultName = re.sub(' ', '', CleanResultName).lower()
        if CleanResultName == CleanComicName or CleanResultName[3:] == CleanComicName:
            # series year may be off by one due to Dec/Jan publication
            # overlap, so accept a 1-year grace
            if resultYear[n] == ComicYear or resultYear[n] == str(int(ComicYear) + 1):
                # GCD and CV issue counts can disagree by one in either
                # direction (ie. witchblade: GCD 159, CV 161) - allow +/-1
                if int(resultIssues[n]) == int(Total) or int(resultIssues[n]) == int(Total) + 1 or (int(resultIssues[n]) + 1) == int(Total):
                    if int(resultIssues[n]) == int(Total) + 1:
                        issvariation = "cv"
                    elif int(resultIssues[n]) + 1 == int(Total):
                        issvariation = "gcd"
                    else:
                        issvariation = "no"
                    TotalIssues = resultIssues[n]
                    resultURL = str(resultID[n])
                    rptxt = resultp('td')[6]
                    resultPublished = rptxt.findNext(text=True)
                    break
        n += 1

    # comicvine and gcd occasionally disagree on exact titles
    # (ie. 'The Flash-2011' on comicvine is 'Flash-2011' on gcd), so retry
    # with spelling/punctuation variations before giving up.
    basnumbs = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
                'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12}
    if resultURL is None:
        # number spelled as a word -> numeric
        for numbs in basnumbs:
            if numbs in ComicName.lower():
                numconv = basnumbs[numbs]
                ComicNm = re.sub(str(numbs), str(numconv), ComicName.lower())
                return GCDScraper(ComicNm, ComicYear, Total, ComicID)
        if ComicName.lower().startswith('the '):
            ComicName = ComicName[4:]
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if ':' in ComicName:
            ComicName = re.sub(':', '', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if '-' in ComicName:
            ComicName = re.sub('-', ' ', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if 'and' in ComicName.lower():
            ComicName = ComicName.replace('and', '&')
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if not quickmatch:
            return 'No Match'
    if quickmatch == "yes":
        if resultURL is None:
            return 'No Match'
        else:
            return 'Match'
    return GCDdetails(comseries=None, resultURL=resultURL, vari_loop=0, ComicID=ComicID, TotalIssues=TotalIssues, issvariation=issvariation, resultPublished=resultPublished)
def GCDScraper(ComicName, ComicYear, Total, ComicID):
    """Scrape comics.org (Grand Comics Database) for a series matching
    the given name / start year / issue count, then parse its details
    page into a per-issue list.

    Parameters -- presumably all strings as passed by the caller (TODO confirm):
        ComicName: series title to search for (spaces URL-encoded as %20).
        ComicYear: series start year to match against GCD's year column.
        Total:     issue count from ComicVine; a GCD count of Total+1 is
                   also accepted (GCD frequently indexes new issues first).
        ComicID:   ComicVine series ID, copied into each gcdchoice entry.

    Returns:
        On success, a dict with keys:
            'ComicIssue'   -- issue number of the last row kept (".00" form)
            'ComicDate'    -- date of the last row kept
            'gcdchoice'    -- list of {'GCDid', 'GCDIssue', 'GCDDate'} dicts,
                              one per (non-alternate) issue row
            'gcdvariation' -- "yes" if the GCD issue count was Total+1
        On failure, the string 'No Match'.

    NOTE(review): writes the module-level global `resultPublished` as a side
    effect; `input` and `type` shadow Python builtins; network failures from
    urllib2.urlopen are unhandled and will propagate; retries itself
    recursively with the leading "The " stripped when no row matches.
    """
    comicnm = ComicName
    comicyr = ComicYear
    comicis = Total
    comicid = ComicID
    #print ( "comicname: " + str(comicnm) )
    #print ( "comicyear: " + str(comicyr) )
    #print ( "comichave: " + str(comicis) )
    #print ( "comicid: " + str(comicid) )
    # URL-encode spaces for the series-name browse URL.
    comicnm = re.sub(' ', '%20', comicnm)
    input = 'http://www.comics.org/series/name/' + str(comicnm) + '/sort/alpha/'
    response = urllib2.urlopen ( input )
    soup = BeautifulSoup ( response)
    # Result rows alternate between two CSS classes; total result count is
    # the sum of both.
    cnt1 = len(soup.findAll("tr", {"class" : "listing_even"}))
    cnt2 = len(soup.findAll("tr", {"class" : "listing_odd"}))
    cnt = int(cnt1 + cnt2)
    #print (str(cnt) + " results")
    # Published-date string is exported for callers via this module global.
    global resultPublished
    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    # Separate cursors into the even-class and odd-class row lists, walked
    # in page order by interleaving on n's parity.
    n_odd = -1
    n_even = -1
    n = 0
    while ( n < cnt ):
        if n%2==0:
            n_even+=1
            resultp = soup.findAll("tr", {"class" : "listing_even"})[n_even]
        else:
            n_odd+=1
            resultp = soup.findAll("tr", {"class" : "listing_odd"})[n_odd]
        # Second <a> in the row is the series link; its text is the name.
        rtp = resultp('a')[1]
        resultName.append(helpers.cleanName(rtp.findNext(text=True)))
        #print ( "Comic Name: " + str(resultName[n]) )
        # Same anchor's href is the relative series URL, used as the ID.
        fip = resultp('a',href=True)[1]
        resultID.append(fip['href'])
        print ( "ID: " + str(resultID[n]) )
        # 4th cell: start year (strip spaces before comparing).
        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ','')
        # 5th cell: issue count, e.g. "12 issues" -- keep only the number
        # before the word 'issue'.
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        resiss = resultIssues[n].find('issue')
        resiss = int(resiss)
        # NOTE(review): .replace('','') is a no-op; only the [:resiss] slice
        # (text before 'issue') matters here.
        resultIssues[n] = resultIssues[n].replace('','')[:resiss]
        resultIssues[n] = resultIssues[n].replace(' ','')
        #print ( "Year: " + str(resultYear[n]) )
        #print ( "Issues: " + str(resultIssues[n]) )
        if resultName[n].lower() == str(ComicName).lower():
            #print ("n:" + str(n) + "...matched by name to Mylar!")
            if resultYear[n] == ComicYear:
                #print ("n:" + str(n) + "...matched by year to Mylar!")
                #Occasionally there are discrepancies in comic count between
                #GCD and CV. 99% it's CV not updating to the newest issue as fast
                #as GCD does. Therefore, let's increase the CV count by 1 to get it
                #to match, any more variation could cause incorrect matching.
                if resultIssues[n] == Total or resultIssues[n] == str(int(Total)+1):
                    # Record whether the match needed the +1 allowance.
                    if resultIssues[n] == str(int(Total)+1):
                        issvariation = "yes"
                    else:
                        issvariation = "no"
                    #print ("n:" + str(n) + "...matched by issues to Mylar!")
                    #print ("complete match!...proceeding")
                    resultURL = str(resultID[n])
                    # 7th cell: published-date range; stored in the global.
                    rptxt = resultp('td')[6]
                    resultPublished = rptxt.findNext(text=True)
                    #print ("Series Published: " + str(resultPublished))
                    break
        n+=1
    # it's possible that comicvine would return a comic name incorrectly, or gcd
    # has the wrong title and won't match 100%...
    # (ie. The Flash-2011 on comicvine is Flash-2011 on gcd)
    if resultURL is None:
        print ("comicnm:" + str(ComicName))
        if ComicName.startswith('The '):
            #print ("No match found - detected The in title...performing deeper analysis")
            # Retry once without the leading article.
            ComicName = ComicName[4:]
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        else:
            #print ("no match found...cannot proceed.")
            return 'No Match'
    # --- matched: fetch and parse the series details page ---------------
    gcdinfo = {}
    gcdchoice = []
    input2 = 'http://www.comics.org' + str(resultURL) + 'details/'
    resp = urllib2.urlopen ( input2 )
    soup = BeautifulSoup ( resp )
    #for newer comics, on-sale date has complete date...
    #for older comics, pub.date is to be used
    type = soup.find(text=' On-sale date ')
    if type:
        #print ("on-sale date detected....adjusting")
        datetype = "on-sale"
    else:
        #print ("pub date defaulting")
        datetype = "pub"
    # NOTE(review): datetype is computed but never used below -- the date
    # column fallback is handled by the len(ParseDate) < 7 check instead.
    cnt1 = len(soup.findAll("tr", {"class" : "row_even_False"}))
    cnt2 = len(soup.findAll("tr", {"class" : "row_even_True"}))
    cnt = int(cnt1 + cnt2)
    #print (str(cnt) + " Issues in Total (this may be wrong due to alternate prints, etc")
    n_odd = -1
    n_even = -1
    n = 0
    # PI tracks the previous issue number (".00" form) so consecutive rows
    # with the same number can be detected as alternate covers; altcount
    # flags whether the current row is the first occurrence of its issue.
    PI = "1.00"
    altcount = 0
    while ( n < cnt ):
        # NOTE(review): parity mapping is inverted vs. the search loop above
        # (even n -> n_odd / row_even_False) -- presumably intentional for
        # this page's row ordering; verify against the live markup.
        if n%2==0:
            n_odd+=1
            parsed = soup.findAll("tr", {"class" : "row_even_False"})[n_odd]
            ntype = "odd"
        else:
            n_even+=1
            ntype = "even"
            parsed = soup.findAll("tr", {"class" : "row_even_True"})[n_even]
        # First anchor text is the issue label, e.g. "1" or "1 [variant]".
        subtxt3 = parsed.find("a")
        ParseIssue = subtxt3.findNext(text=True)
        isslen = ParseIssue.find(' ')
        #if 'isslen' exists, it means that it's an alternative cover.
        #however, if ONLY alternate covers exist of an issue it won't work.
        #let's use the FIRST record, and ignore all other covers for the given issue.
        if isslen > 0:
            isschk = ParseIssue[:isslen]
            isschk2 = str(isschk) + ".00"
            ParseIssue = str(isschk2)
            #print ("Alt.cover found = " + str(isschk2))
            if str(PI) == str(isschk2):
                if altcount == 0:
                    #this handles the first occurance..
                    print ("Fist occurance detected - " + str(isschk))
                    ParseIssue = str(isschk2)
                    PI = str(isschk2)
                    altcount = 1
                else:
                    #print ("Using only first record for issue - ignoring further alternate matches")
                    # Sentinel value; row is skipped by the altcount guard.
                    ParseIssue = "this is wrong"
                    altcount+=1
            else:
                altcount = 1
                ParseIssue = str(isschk) + ".00"
        else:
            ParseIssue = ParseIssue + ".00"
            #print ("no alt.cover detected for - " + str(ParseIssue))
            altcount = 1
        if (altcount == 1):
            # in order to get the compare right, let's decimialize the string to '.00'.
            gcdinfo['ComicIssue'] = ParseIssue
            #print ( "Issue : " + str(ParseIssue) )
            #^^ will retrieve issue
            #if datetype == "on-sale":
            # 3rd cell normally holds the date; if it's too short, fall
            # back to the first cell.
            subtxt1 = parsed('td')[2]
            ParseDate = subtxt1.findNext(text=True)
            pdlen = len(ParseDate)
            #print ("Parsed Date length: " + str(pdlen))
            if len(ParseDate) < 7:
                subtxt1 = parsed.find("td")
                ParseDate = subtxt1.findNext(text=True)
            # Blank (nbsp) date becomes the zero-date placeholder.
            if ParseDate == ' ':
                ParseDate = "0000-00-00"
            ParseDate = ParseDate.replace(' ','')
            gcdinfo['ComicDate'] = ParseDate
            #print ( "Date : " + str(ParseDate) )
            #^^ will retrieve date
            #
            # Accumulate one entry per kept issue; 'ComicIssue'/'ComicDate'
            # above are overwritten each pass, so gcdinfo keeps the last row's.
            gcdchoice.append({
                'GCDid': ComicID,
                'GCDIssue': gcdinfo['ComicIssue'],
                'GCDDate': gcdinfo['ComicDate']
                })
            gcdinfo['gcdchoice'] = gcdchoice
            PI = ParseIssue
        #else:
        # -- this needs a rework --
        # if issue only has alternative covers on comics.org, it won't match
        # and will cause the script to return a cannot retrieve..
        #compare previous issue to current issue (to help with alt.cover count)
        #    PI = ParseIssue
        #    altcount+=1
        #    print ("alternate issue - ignoring")
        #altcount = 0
        n+=1
    gcdinfo['gcdvariation'] = issvariation
    return gcdinfo