Example #1
0
def GCDScraper(ComicName, ComicYear, Total, ComicID, quickmatch=None):
    """Search comics.org (GCD) for a series that matches the ComicVine data.

    A result row matches when its normalized name equals the normalized
    ComicName (or equals it with the first 3 characters dropped, to absorb
    a leading 'The' on the GCD side), its start year is ComicYear or
    ComicYear+1, and its issue count is within +/-1 of Total.

    On a hit: returns GCDdetails(...) for the matched series, or the string
    'Match' when quickmatch == "yes".
    On a miss: retries itself with common title variations (spelled-out
    numbers -> digits, leading 'the ' stripped, ':' removed, '-' -> ' ',
    'and' -> '&') and ultimately returns 'No Match'.
    """
    NOWyr = datetime.date.today().year
    # In December next year's cover dates are already in use; widen the window.
    if datetime.date.today().month == 12:
        NOWyr = NOWyr + 1
        logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
    comicnm = ComicName
    comicyr = ComicYear
    comicis = Total
    comicid = ComicID
    comicnm = re.sub(' ', '+', comicnm)
    # renamed from 'input' so the builtin is not shadowed
    searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'
    response = urllib2.urlopen(searchurl)
    soup = BeautifulSoup(response)
    # result rows alternate between the 'listing_even'/'listing_odd' classes
    cnt1 = len(soup.findAll("tr", {"class" : "listing_even"}))
    cnt2 = len(soup.findAll("tr", {"class" : "listing_odd"}))

    cnt = int(cnt1 + cnt2)

    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while (n < cnt):
        # re-interleave the even/odd rows back into on-page order
        if n % 2 == 0:
            n_even += 1
            resultp = soup.findAll("tr", {"class" : "listing_even"})[n_even]
        else:
            n_odd += 1
            resultp = soup.findAll("tr", {"class" : "listing_odd"})[n_odd]
        rtp = resultp('a')[1]
        resultName.append(helpers.cleanName(rtp.findNext(text=True)))
        fip = resultp('a', href=True)[1]
        resultID.append(fip['href'])

        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ', '')
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # keep only the number in front of the word 'issue(s)'.
        # NOTE(review): if 'issue' is ever absent, find() returns -1 and the
        # slice drops the last character instead -- assumed not to happen on
        # GCD series listings; confirm before hardening.
        resiss = resultIssues[n].find('issue')
        resiss = int(resiss)
        resultIssues[n] = resultIssues[n][:resiss]  # was .replace('','')[:resiss]; replace('','') is a no-op
        resultIssues[n] = resultIssues[n].replace(' ', '')
        # compare punctuation-free, space-free, lower-cased names
        CleanComicName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', comicnm)
        CleanComicName = re.sub(' ', '', CleanComicName).lower()
        CleanResultName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', resultName[n])
        CleanResultName = re.sub(' ', '', CleanResultName).lower()
        # [3:] absorbs a leading 'the' on the GCD side
        if CleanResultName == CleanComicName or CleanResultName[3:] == CleanComicName:
            # The series year may legitimately be off by +1: publication dates
            # overlapping Dec/11 and Jan/12 can shift the GCD start year.
            if resultYear[n] == ComicYear or resultYear[n] == str(int(ComicYear)+1):
                # Issue counts between GCD and CV occasionally differ by one
                # (usually CV lagging the newest issue, ie. witchblade on GCD
                # says 159 issues, CV states 161), so accept +/-1 and record
                # which side is ahead.
                if int(resultIssues[n]) == int(Total) or int(resultIssues[n]) == int(Total)+1 or (int(resultIssues[n])+1) == int(Total):
                    if int(resultIssues[n]) == int(Total)+1:
                        issvariation = "cv"
                    elif int(resultIssues[n])+1 == int(Total):
                        issvariation = "gcd"
                    else:
                        issvariation = "no"
                    TotalIssues = resultIssues[n]
                    resultURL = str(resultID[n])
                    rptxt = resultp('td')[6]
                    resultPublished = rptxt.findNext(text=True)
                    break

        n += 1
    # It's possible that comicvine returned a comic name incorrectly, or gcd
    # has the wrong title and won't match 100%
    # (ie. The Flash-2011 on comicvine is Flash-2011 on gcd),
    # so account for variations in spelling, punctuation, etc.
    basnumbs = {'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9,'ten':10,'eleven':11,'twelve':12}
    if resultURL is None:
        # numbers spelled out as words -> numerals
        for numbs in basnumbs:
            if numbs in ComicName.lower():
                numconv = basnumbs[numbs]
                ComicNm = re.sub(str(numbs), str(numconv), ComicName.lower())
                # (unreachable 'break' after this return removed)
                return GCDScraper(ComicNm, ComicYear, Total, ComicID)
        if ComicName.lower().startswith('the '):
            ComicName = ComicName[4:]
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if ':' in ComicName:
            ComicName = re.sub(':', '', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if '-' in ComicName:
            ComicName = re.sub('-', ' ', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if 'and' in ComicName.lower():
            ComicName = ComicName.replace('and', '&')
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if not quickmatch: return 'No Match'
    if quickmatch == "yes":
        if resultURL is None: return 'No Match'
        else: return 'Match'
    # NOTE(review): a truthy quickmatch other than "yes" with no match would
    # reach here with TotalIssues/issvariation/resultPublished unbound --
    # callers are assumed to pass only None or "yes".
    return GCDdetails(comseries=None, resultURL=resultURL, vari_loop=0, ComicID=ComicID, TotalIssues=TotalIssues, issvariation=issvariation, resultPublished=resultPublished)
Example #2
0
def ComChk(ComicName, ComicYear, ComicPublisher, Total, ComicID):
    """Run up to 3 comics.org searches for ComicName and collect candidates.

    Builds search variants of the name (as-is; last word dropped when the
    title has 3+ words; punctuation stripped), queries the GCD advanced
    search for each, and appends every previously-unseen result row (keyed
    by GCDID) to a candidate list.

    Returns a tuple (comchoice, totalcount) where comchoice is
    {'comchkchoice': [{ComicID, ComicName, GCDID, ComicYear,
    ComicPublisher, ComicURL, ComicIssues}, ...]}.
    """
    comchkchoice = []
    comchoice = {}

    NOWyr = datetime.date.today().year
    # In December next year's cover dates are already in use; widen the window.
    if datetime.date.today().month == 12:
        NOWyr = NOWyr + 1
        logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
    comicnm = ComicName
    comicyr = ComicYear
    comicis = Total
    comicid = ComicID
    comicpub = ComicPublisher
    #print ( "comicname: " + str(comicnm) )
    #print ( "comicyear: " + str(comicyr) )
    #print ( "comichave: " + str(comicis) )
    #print ( "comicpub: " + str(comicpub) )
    #print ( "comicid: " + str(comicid) )
    # do 3 runs at the comics.org search to get the best results
    comicrun = []
    # &pub_name=DC
    # have to remove the spaces from Publisher or else will not work (ie. DC Comics vs DC will not match)
    # take the 1st word ;)
    #comicpub = comicpub.split()[0]
    # if it's not one of the BIG publisher's it might fail - so let's increase the odds.
    pubbiggies = [ 'DC', 
                   'Marvel',
                   'Image',
                   'IDW' ]
    uhuh = "no"
    for pb in pubbiggies:
        if pb in comicpub:
            #keep publisher in url if a biggie.    
            uhuh = "yes"
            #print (" publisher match : " + str(comicpub))
            # only the 1st word of the publisher name is usable in the query
            conv_pub = comicpub.split()[0]
            #print (" converted publisher to : " + str(conv_pub))
    #1st run setup - leave it all as it is.
    comicrun.append(comicnm)
    cruncnt = 0
    #2nd run setup - remove the last character and do a broad search (keep year or else will blow up)
    if len(str(comicnm).split()) > 2:
        comicrun.append(' '.join(comicnm.split(' ')[:-1]))
        cruncnt+=1
    # to increase the likely hood of matches and to get a broader scope...
    # lets remove extra characters
    if re.sub('[\.\,\:]', '', comicnm) != comicnm:
        comicrun.append(re.sub('[\.\,\:]', '', comicnm))
        cruncnt+=1
    totalcount = 0
    cr = 0
    #print ("cruncnt is " + str(cruncnt))
    while (cr <= cruncnt):
        #print ("cr is " + str(cr))
        comicnm = comicrun[cr]
        #leaving spaces in will screw up the search...let's take care of it
        comicnm = re.sub(' ', '+', comicnm)
        #print ("comicnm: " + str(comicnm))
        if uhuh == "yes":
            publink = "&pub_name=" + str(conv_pub)
        if uhuh == "no":
            publink = "&pub_name="
        # NOTE: local 'input' shadows the builtin of the same name here.
        input = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&keywords=&order1=series&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31' + '&title=&feature=&job_number=&pages=&script=&pencils=&inks=&colors=&letters=&story_editing=&genre=&characters=&synopsis=&reprint_notes=&story_reprinted=None&notes=' + str(publink) + '&pub_notes=&brand=&brand_notes=&indicia_publisher=&is_surrogate=None&ind_pub_notes=&series=' + str(comicnm) + '&series_year_began=&series_notes=&tracking_notes=&issue_count=&is_comics=None&format=&color=&dimensions=&paper_stock=&binding=&publishing_format=&issues=&volume=&issue_title=&variant_name=&issue_date=&indicia_frequency=&price=&issue_pages=&issue_editing=&isbn=&barcode=&issue_notes=&issue_reprinted=None&is_indexed=None'
        response = urllib2.urlopen ( input )
        soup = BeautifulSoup ( response)
        # result rows alternate between the 'listing_even'/'listing_odd' classes
        cnt1 = len(soup.findAll("tr", {"class" : "listing_even"}))
        cnt2 = len(soup.findAll("tr", {"class" : "listing_odd"}))

        cnt = int(cnt1 + cnt2)
#        print ("cnt1: " + str(cnt1))
#        print ("cnt2: " + str(cnt2))
#        print (str(cnt) + " results")

        resultName = []
        resultID = []
        resultYear = []
        resultIssues = []
        resultPublisher = []
        resultURL = None
        n_odd = -1
        n_even = -1
        n = 0
        while ( n < cnt ):
            # re-interleave the even/odd rows back into on-page order
            if n%2==0:
                n_even+=1
                resultp = soup.findAll("tr", {"class" : "listing_even"})[n_even]
            else:
                n_odd+=1
                resultp = soup.findAll("tr", {"class" : "listing_odd"})[n_odd]
            rtp = resultp('a')[1]
            resultName.append(helpers.cleanName(rtp.findNext(text=True)))
#            print ( "Comic Name: " + str(resultName[n]) )

            pub = resultp('a')[0]
            resultPublisher.append(pub.findNext(text=True))
#            print ( "Publisher: " + str(resultPublisher[n]) )

            fip = resultp('a',href=True)[1]
            resultID.append(fip['href'])
#            print ( "ID: " + str(resultID[n]) )

            subtxt3 = resultp('td')[3]
            resultYear.append(subtxt3.findNext(text=True))
            resultYear[n] = resultYear[n].replace(' ','')
            subtxt4 = resultp('td')[4]
            resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
            # keep only the number in front of the word 'issue(s)'.
            # NOTE(review): if 'issue' is absent, find() returns -1 and the
            # slice drops the last character instead -- assumed not to occur.
            resiss = resultIssues[n].find('issue')
            resiss = int(resiss)
            resultIssues[n] = resultIssues[n].replace('','')[:resiss]
            resultIssues[n] = resultIssues[n].replace(' ','')
#            print ( "Year: " + str(resultYear[n]) )
#            print ( "Issues: " + str(resultIssues[n]) )
#            print ("comchkchoice: " + str(comchkchoice))
            # de-duplicate across the multiple search runs, keyed on GCDID.
            # NOTE(review): compares against str(resultID[n]) (the full href)
            # while entries store only the ID segment -- confirm intent.
            if not any(d.get('GCDID', None) == str(resultID[n]) for d in comchkchoice):
                #print ( str(resultID[n]) + " not in DB...adding.")
                comchkchoice.append({
                       "ComicID":         str(comicid),
                       "ComicName":       str(resultName[n]),
                       "GCDID":           str(resultID[n]).split('/')[2],
                       "ComicYear" :      str(resultYear[n]),
                       "ComicPublisher" : str(resultPublisher[n]),
                       "ComicURL" :       "http://www.comics.org" + str(resultID[n]),
                       "ComicIssues" :    str(resultIssues[n])
                      })
            #else:
                #print ( str(resultID[n]) + " already in DB...skipping" ) 
            n+=1
        cr+=1
    # NOTE(review): this sits outside the while loop, so only the final run's
    # cnt is counted -- confirm whether it should accumulate once per run.
    totalcount= totalcount + cnt
    comchoice['comchkchoice'] = comchkchoice
    return comchoice, totalcount 
Example #3
0
def GCDScraper(ComicName, ComicYear, Total, ComicID):
    """Match ComicName against comics.org (GCD) and scrape its issue list.

    Phase 1: queries the GCD advanced search and looks for a row whose name
    (exact, case-insensitive) and year (ComicYear or ComicYear+1) and issue
    count (within +/-1 of Total) all match; retries recursively with title
    variations ('The ' stripped, ':' removed, '-' -> ' ', 'and' -> '&').
    Phase 2: fetches the matched series' details page and builds one entry
    per issue (skipping alternate-cover duplicates).

    Returns 'No Match', or a dict gcdinfo with keys 'gcdchoice' (list of
    {GCDid, GCDIssue, GCDDate}), 'gcdvariation' and 'totalissues'.
    Side effect: assigns the module-level global resultPublished.
    """
    NOWyr = datetime.date.today().year
    comicnm = ComicName
    comicyr = ComicYear
    comicis = Total
    comicid = ComicID
    #print ( "comicname: " + str(comicnm) )
    #print ( "comicyear: " + str(comicyr) )
    #print ( "comichave: " + str(comicis) )
    #print ( "comicid: " + str(comicid) )
    comicnm = re.sub(' ', '%20', comicnm)
    #input = 'http://www.comics.org/series/name/' + str(comicnm) + '/sort/alpha/'
    # NOTE: local 'input' shadows the builtin of the same name here.
    input = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'
    response = urllib2.urlopen ( input )
    soup = BeautifulSoup ( response)

    # result rows alternate between the 'listing_even'/'listing_odd' classes
    cnt1 = len(soup.findAll("tr", {"class" : "listing_even"}))
    cnt2 = len(soup.findAll("tr", {"class" : "listing_odd"}))

    cnt = int(cnt1 + cnt2)

    #print (str(cnt) + " results")

    # published-range string is exported for callers via this global
    global resultPublished

    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while ( n < cnt ):
        # re-interleave the even/odd rows back into on-page order
        if n%2==0:
            n_even+=1
            resultp = soup.findAll("tr", {"class" : "listing_even"})[n_even]
        else:
            n_odd+=1
            resultp = soup.findAll("tr", {"class" : "listing_odd"})[n_odd]
        rtp = resultp('a')[1]
        resultName.append(helpers.cleanName(rtp.findNext(text=True)))
        #print ( "Comic Name: " + str(resultName[n]) )
        fip = resultp('a',href=True)[1]
        resultID.append(fip['href'])
        #print ( "ID: " + str(resultID[n]) )

        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ','')
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # keep only the number in front of the word 'issue(s)'.
        # NOTE(review): if 'issue' is absent, find() returns -1 and the slice
        # drops the last character instead -- assumed not to occur.
        resiss = resultIssues[n].find('issue')
        resiss = int(resiss)
        resultIssues[n] = resultIssues[n].replace('','')[:resiss]
        resultIssues[n] = resultIssues[n].replace(' ','')
        #print ( "Year: " + str(resultYear[n]) )
        #print ( "Issues: " + str(resultIssues[n]) )
        if resultName[n].lower() == str(ComicName).lower(): 
            #print ("n:" + str(n) + "...matched by name to Mylar!")
            #this has been seen in a few instances already, so trying to adjust.
            #when the series year is 2011, in gcd it might be 2012 due to publication
            #dates overlapping between Dec/11 and Jan/12. Let's accept a match with a 
            #1 year grace space, and then pull in the first issue to see the actual pub
            # date and if coincides with the other date..match it.
            if resultYear[n] == ComicYear or resultYear[n] == str(int(ComicYear)+1): 
                #print ("n:" + str(n) + "...matched by year to Mylar!")
                #print ( "Year: " + str(resultYear[n]) )
                #Occasionally there are discrepancies in comic count between
                #GCD and CV. 99% it's CV not updating to the newest issue as fast
                #as GCD does. Therefore, let's increase the CV count by 1 to get it
                #to match, any more variation could cause incorrect matching.
                #ie. witchblade on GCD says 159 issues, CV states 161.
                if resultIssues[n] == Total or resultIssues[n] == str(int(Total)+1) or str(int(resultIssues[n])+1) == Total:
                    # record which side's count is ahead by one
                    if resultIssues[n] == str(int(Total)+1):
                        issvariation = "cv"
                    elif str(int(resultIssues[n])+1) == Total:
                        issvariation = "gcd"
                    else:
                        issvariation = "no"
                        #print ("n:" + str(n) + "...matched by issues to Mylar!")
                        #print ("complete match!...proceeding")
                    TotalIssues = resultIssues[n]
                    resultURL = str(resultID[n])
                    rptxt = resultp('td')[6]
                    resultPublished = rptxt.findNext(text=True)
                    #print ("Series Published: " + str(resultPublished))
                    break
                
        n+=1
    # it's possible that comicvine would return a comic name incorrectly, or gcd
    # has the wrong title and won't match 100%...
    # (ie. The Flash-2011 on comicvine is Flash-2011 on gcd)
    if resultURL is None:
        if ComicName.startswith('The '):
            ComicName = ComicName[4:]
            return GCDScraper(ComicName, ComicYear, Total, ComicID)        
        if ':' in ComicName: 
            ComicName = re.sub(':', '', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if '-' in ComicName:
            ComicName = re.sub('-', ' ', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if 'and' in ComicName.lower():
            ComicName = ComicName.replace('and', '&')
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        return 'No Match'
    # --- phase 2: scrape the matched series' details page for issue data ---
    gcdinfo = {}
    gcdchoice = []

    input2 = 'http://www.comics.org' + str(resultURL) + 'details/'
    resp = urllib2.urlopen ( input2 )
    soup = BeautifulSoup ( resp )

    #for newer comics, on-sale date has complete date...
    #for older comics, pub.date is to be used

    # NOTE: local 'type' shadows the builtin of the same name here.
    type = soup.find(text=' On-sale date ')
    if type:
        #print ("on-sale date detected....adjusting")
        datetype = "on-sale"
    else:
        #print ("pub date defaulting")
        datetype = "pub"
    # NOTE(review): datetype is never consulted below -- the later
    # "if datetype" branch is commented out; confirm whether it was meant
    # to select between the date columns.

    # issue rows alternate between the 'row_even_False'/'row_even_True' classes
    cnt1 = len(soup.findAll("tr", {"class" : "row_even_False"}))
    cnt2 = len(soup.findAll("tr", {"class" : "row_even_True"}))

    cnt = int(cnt1 + cnt2)

    #print (str(cnt) + " Issues in Total (this may be wrong due to alternate prints, etc")

    n_odd = -1
    n_even = -1
    n = 0
    # PI holds the previous row's parsed issue number (alt-cover detection)
    PI = "1.00"
    altcount = 0
    while ( n < cnt ):       
        # NOTE: unlike phase 1, n%2==0 advances n_odd here (the even/odd
        # counter naming is swapped but the interleave is still in order).
        if n%2==0:
            n_odd+=1
            parsed = soup.findAll("tr", {"class" : "row_even_False"})[n_odd]
            ntype = "odd"
        else:
            n_even+=1
            ntype = "even"
            parsed = soup.findAll("tr", {"class" : "row_even_True"})[n_even]
        subtxt3 = parsed.find("a")
        ParseIssue = subtxt3.findNext(text=True)
        if ',' in ParseIssue: ParseIssue = re.sub("\,", "", ParseIssue)
        isslen = ParseIssue.find(' ')
        #if 'isslen' exists, it means that it's an alternative cover.
        #however, if ONLY alternate covers exist of an issue it won't work.
        #let's use the FIRST record, and ignore all other covers for the given issue.
        isschk = ParseIssue[:isslen]
        #check if decimal exists or not, and store decimal results
        if '.' in isschk:
            isschk_find = isschk.find('.')
            isschk_b4dec = isschk[:isschk_find]
            isschk_decval = isschk[isschk_find+1:]
        else:
            isschk_decval = ".00"

        if isslen > 0:
            # a space in the issue text marks an alternate-cover variant
            isschk = ParseIssue[:isslen]
            isschk2 = str(isschk) + isschk_decval
            ParseIssue = str(isschk2)
            #print ("Alt.cover found = " + str(isschk2))
            if str(PI) == str(isschk2):
                if altcount == 0:
                    #this handles the first occurance..                    print ("Fist occurance detected - " + str(isschk))
                    ParseIssue = str(isschk2)
                    PI = str(isschk2)
                    altcount = 1
                else:
                    #print ("Using only first record for issue - ignoring further alternate matches")
                    # sentinel: altcount > 1 makes the record-emitting block
                    # below skip this duplicate cover entirely
                    ParseIssue = "this is wrong"
                    altcount+=1
            else:
                altcount = 1
                ParseIssue = str(isschk) + isschk_decval
        else:
            ParseIssue = ParseIssue + isschk_decval
            #print ("no alt.cover detected for - " + str(ParseIssue))
            altcount = 1
        # only the first record for each issue (altcount == 1) is emitted
        if (altcount == 1):
            # in order to get the compare right, let's decimialize the string to '.00'.
            gcdinfo['ComicIssue'] = ParseIssue
            #print ( "Issue : " + str(ParseIssue) )
            #^^ will retrieve issue
            #if datetype == "on-sale":
            subtxt1 = parsed('td')[2]
            ParseDate = subtxt1.findNext(text=True)
            pdlen = len(ParseDate)
            #print ("Parsed Date length: " + str(pdlen))
            # short/blank date column: fall back to the first cell, then to a
            # zeroed placeholder date
            if len(ParseDate) < 7:
                subtxt1 = parsed.find("td")
                ParseDate = subtxt1.findNext(text=True)               
                if ParseDate == ' ':
                    ParseDate = "0000-00-00"
            ParseDate = ParseDate.replace(' ','')
            gcdinfo['ComicDate'] = ParseDate
            #print ( "Date : " + str(ParseDate) )
            #^^ will retrieve date #


            gcdchoice.append({
                'GCDid':                ComicID,
                'GCDIssue':             gcdinfo['ComicIssue'],
                'GCDDate':              gcdinfo['ComicDate']
                })

            gcdinfo['gcdchoice'] = gcdchoice
            PI = ParseIssue
        #else:
            # -- this needs a rework --
            # if issue only has alternative covers on comics.org, it won't match
            # and will cause the script to return a cannot retrieve..
            #compare previous issue to current issue (to help with alt.cover count)
         #   PI = ParseIssue
         #   altcount+=1
         #   print ("alternate issue - ignoring")
        #altcount = 0
        n+=1
    gcdinfo['gcdvariation'] = issvariation
    gcdinfo['totalissues'] = TotalIssues
    return gcdinfo
Example #4
0
def annualCheck(gcomicid, comicid, comicname, comicyear):
    """Search comics.org for the '<comicname> annual' series of a given year.

    Only meaningful once the base series has already been matched for GCD.
    Scrapes the GCD advanced search for '<comicname> annual' restricted to
    comicyear and walks the result rows looking for a name/year match.

    NOTE(review): the matched values (TotalIssues, resultURL,
    resultPublished) are computed but currently discarded -- the function
    always returns None; kept as-is to preserve the existing contract.
    """
    print ("GcomicID: " + str(gcomicid))
    print ("comicID: " + str(comicid))
    print ("comicname: " + comicname)
    print ("comicyear: " + str(comicyear))
    comicnm = comicname.encode('utf-8').strip()
    # escape literal '+' before spaces become '+' in the query string
    comicnm_1 = re.sub('\+', '%2B', comicnm + " annual")
    comicnm = re.sub(' ', '+', comicnm_1)
    # renamed from 'input' so the builtin is not shadowed
    searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(comicyear) + '-01-01&end_date=' + str(comicyear) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'

    response = urllib2.urlopen(searchurl)
    soup = BeautifulSoup(response)
    # result rows alternate between the 'listing_even'/'listing_odd' classes
    cnt1 = len(soup.findAll("tr", {"class": "listing_even"}))
    cnt2 = len(soup.findAll("tr", {"class": "listing_odd"}))

    cnt = int(cnt1 + cnt2)

    print (str(cnt) + " results")

    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while (n < cnt):
        # re-interleave the even/odd rows back into on-page order
        if n%2==0:
            n_even+=1
            resultp = soup.findAll("tr", {"class": "listing_even"})[n_even]
        else:
            n_odd+=1
            resultp = soup.findAll("tr", {"class": "listing_odd"})[n_odd]
        rtp = resultp('a')[1]
        # BUGFIX: the original ran re.sub over the Tag object itself
        # (re.sub('Annual', '', rtp)), which raises TypeError and the result
        # has no findNext(); extract the link text first, then strip 'Annual'.
        rtp1 = re.sub('Annual', '', rtp.findNext(text=True))
        resultName.append(helpers.cleanName(rtp1))
        print ("Comic Name: " + str(resultName[n]))
        fip = resultp('a', href=True)[1]
        resultID.append(fip['href'])
        print ("ID: " + str(resultID[n]))

        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ', '')

        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # keep only the number in front of the word 'issue(s)'.
        # NOTE(review): if 'issue' is absent, find() returns -1 and the slice
        # drops the last character instead -- assumed not to occur.
        resiss = resultIssues[n].find('issue')
        resiss = int(resiss)
        resultIssues[n] = resultIssues[n][:resiss]  # was .replace('','')[:resiss]; replace('','') is a no-op
        resultIssues[n] = resultIssues[n].replace(' ', '')
        print ("Year: " + str(resultYear[n]))
        print ("Issues: " + str(resultIssues[n]))
        # compare punctuation-free, space-free, lower-cased names
        CleanComicName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', comicnm)
        CleanComicName = re.sub(' ', '', CleanComicName).lower()
        CleanResultName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', resultName[n])
        CleanResultName = re.sub(' ', '', CleanResultName).lower()
        print ("CleanComicName: " + str(CleanComicName))
        print ("CleanResultName: " + str(CleanResultName))
        # NOTE(review): comicnm still carries the ' annual' suffix while
        # 'Annual' was stripped from the result name, so these rarely compare
        # equal -- confirm the intended normalization.
        if CleanResultName == CleanComicName or CleanResultName[3:] == CleanComicName:
            # BUGFIX: the original compared against the undefined name
            # 'ComicYear' (the parameter is 'comicyear'), raising NameError.
            # A +1 grace year covers Dec/Jan publication-date overlap.
            if resultYear[n] == str(comicyear) or resultYear[n] == str(int(comicyear) + 1):
                print ("n:" + str(n) + "...matched by year to Mylar!")
                print ("Year: " + str(resultYear[n]))
                TotalIssues = resultIssues[n]
                resultURL = str(resultID[n])
                rptxt = resultp('td')[6]
                resultPublished = rptxt.findNext(text=True)
                break

        n+=1
    return
Example #5
0
def ComChk(ComicName, ComicYear, ComicPublisher, Total, ComicID):
    """Run up to 4 comics.org searches for ComicName and collect candidates.

    UTF-8-aware variant: encodes the name/publisher up front and adds a
    search run with a leading 'the' stripped.  Builds search variants
    (as-is; last word dropped when the title has 3+ words; punctuation
    stripped; 'the' removed), queries the GCD advanced search for each,
    and appends every previously-unseen result row (keyed by GCDID) to a
    candidate list.

    Returns a tuple (comchoice, totalcount) where comchoice is
    {'comchkchoice': [{ComicID, ComicName, GCDID, ComicYear,
    ComicPublisher, ComicURL, ComicIssues}, ...]}.
    """
    comchkchoice = []
    comchoice = {}

    NOWyr = datetime.date.today().year
    # In December next year's cover dates are already in use; widen the window.
    if datetime.date.today().month == 12:
        NOWyr = NOWyr + 1
        logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
    comicnm = ComicName.encode('utf-8').strip()
    comicyr = ComicYear
    comicis = Total
    comicid = ComicID
    comicpub = ComicPublisher.encode('utf-8').strip()
    #print ("...comchk parser initialization...")
    #print ( "comicname: " + str(comicnm) )
    #print ( "comicyear: " + str(comicyr) )
    #print ( "comichave: " + str(comicis) )
    #print ( "comicpub: " + str(comicpub) )
    #print ( "comicid: " + str(comicid) )
    # do 3 runs at the comics.org search to get the best results
    comicrun = []
    # &pub_name=DC
    # have to remove the spaces from Publisher or else will not work (ie. DC Comics vs DC will not match)
    # take the 1st word ;)
    #comicpub = comicpub.split()[0]
    # if it's not one of the BIG publisher's it might fail - so let's increase the odds.
    pubbiggies = ['DC',
                   'Marvel',
                   'Image',
                   'IDW']
    uhuh = "no"
    for pb in pubbiggies:
        if pb in comicpub:
            #keep publisher in url if a biggie.
            uhuh = "yes"
            #print (" publisher match : " + str(comicpub))
            # only the 1st word of the publisher name is usable in the query
            conv_pub = comicpub.split()[0]
            #print (" converted publisher to : " + str(conv_pub))
    #1st run setup - leave it all as it is.
    comicrun.append(comicnm)
    cruncnt = 0
    #2nd run setup - remove the last character and do a broad search (keep year or else will blow up)
    if len(str(comicnm).split()) > 2:
        comicrun.append(' '.join(comicnm.split(' ')[:-1]))
        cruncnt+=1
    # to increase the likely hood of matches and to get a broader scope...
    # lets remove extra characters
    if re.sub('[\.\,\:]', '', comicnm) != comicnm:
        comicrun.append(re.sub('[\.\,\:]', '', comicnm))
        cruncnt+=1
    # one more addition - if the title contains a 'the', remove it ;)
    if comicnm.lower().startswith('the'):
        comicrun.append(comicnm[4:].strip())
        cruncnt+=1
    totalcount = 0
    cr = 0
    #print ("cruncnt is " + str(cruncnt))
    while (cr <= cruncnt):
        #print ("cr is " + str(cr))
        comicnm = comicrun[cr]
        #leaving spaces in will screw up the search...let's take care of it
        comicnm = re.sub(' ', '+', comicnm)
        #print ("comicnm: " + str(comicnm))
        if uhuh == "yes":
            publink = "&pub_name=" + str(conv_pub)
        if uhuh == "no":
            publink = "&pub_name="
        # NOTE: local 'input' shadows the builtin of the same name here.
        input = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&keywords=&order1=series&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31' + '&title=&feature=&job_number=&pages=&script=&pencils=&inks=&colors=&letters=&story_editing=&genre=&characters=&synopsis=&reprint_notes=&story_reprinted=None&notes=' + str(publink) + '&pub_notes=&brand=&brand_notes=&indicia_publisher=&is_surrogate=None&ind_pub_notes=&series=' + str(comicnm) + '&series_year_began=&series_notes=&tracking_notes=&issue_count=&is_comics=None&format=&color=&dimensions=&paper_stock=&binding=&publishing_format=&issues=&volume=&issue_title=&variant_name=&issue_date=&indicia_frequency=&price=&issue_pages=&issue_editing=&isbn=&barcode=&issue_notes=&issue_reprinted=None&is_indexed=None'
        response = urllib2.urlopen (input)
        soup = BeautifulSoup (response)
        # result rows alternate between the 'listing_even'/'listing_odd' classes
        cnt1 = len(soup.findAll("tr", {"class": "listing_even"}))
        cnt2 = len(soup.findAll("tr", {"class": "listing_odd"}))

        cnt = int(cnt1 + cnt2)
#        print ("cnt1: " + str(cnt1))
#        print ("cnt2: " + str(cnt2))
#        print (str(cnt) + " results")

        resultName = []
        resultID = []
        resultYear = []
        resultIssues = []
        resultPublisher = []
        resultURL = None
        n_odd = -1
        n_even = -1
        n = 0
        while (n < cnt):
            # re-interleave the even/odd rows back into on-page order
            if n%2==0:
                n_even+=1
                resultp = soup.findAll("tr", {"class": "listing_even"})[n_even]
            else:
                n_odd+=1
                resultp = soup.findAll("tr", {"class": "listing_odd"})[n_odd]
            rtp = resultp('a')[1]
            rtpit = rtp.findNext(text=True)
            rtpthis = rtpit.encode('utf-8').strip()
            resultName.append(helpers.cleanName(rtpthis))
#            print ( "Comic Name: " + str(resultName[n]) )

            pub = resultp('a')[0]
            pubit = pub.findNext(text=True)
#            pubthis = u' '.join(pubit).encode('utf-8').strip()
            pubthis = pubit.encode('utf-8').strip()
            resultPublisher.append(pubthis)
#            print ( "Publisher: " + str(resultPublisher[n]) )

            fip = resultp('a', href=True)[1]
            resultID.append(fip['href'])
#            print ( "ID: " + str(resultID[n]) )

            subtxt3 = resultp('td')[3]
            resultYear.append(subtxt3.findNext(text=True))
            resultYear[n] = resultYear[n].replace(' ', '')
            subtxt4 = resultp('td')[4]
            resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
            # keep only the number in front of the word 'issue(s)'.
            # NOTE(review): if 'issue' is absent, find() returns -1 and the
            # slice drops the last character instead -- assumed not to occur.
            resiss = resultIssues[n].find('issue')
            resiss = int(resiss)
            resultIssues[n] = resultIssues[n].replace('', '')[:resiss]
            resultIssues[n] = resultIssues[n].replace(' ', '')
#            print ( "Year: " + str(resultYear[n]) )
#            print ( "Issues: " + str(resultIssues[n]) )
#            print ("comchkchoice: " + str(comchkchoice))
            # de-duplicate across the multiple search runs, keyed on GCDID.
            # NOTE(review): compares against str(resultID[n]) (the full href)
            # while entries store only the ID segment -- confirm intent.
            if not any(d.get('GCDID', None) == str(resultID[n]) for d in comchkchoice):
                #print ( str(resultID[n]) + " not in DB...adding.")
                comchkchoice.append({
                       "ComicID":         str(comicid),
                       "ComicName":       resultName[n],
                       "GCDID":           str(resultID[n]).split('/')[2],
                       "ComicYear":      str(resultYear[n]),
                       "ComicPublisher": resultPublisher[n],
                       "ComicURL":       "http://www.comics.org" + str(resultID[n]),
                       "ComicIssues":    str(resultIssues[n])
                      })
            #else:
                #print ( str(resultID[n]) + " already in DB...skipping" )
            n+=1
        cr+=1
    # NOTE(review): this sits outside the while loop, so only the final run's
    # cnt is counted -- confirm whether it should accumulate once per run.
    totalcount= totalcount + cnt
    comchoice['comchkchoice'] = comchkchoice
    return comchoice, totalcount
Example #6
0
def GCDScraper(ComicName, ComicYear, Total, ComicID, quickmatch=None):
    """Search comics.org (the Grand Comics Database) for a series match.

    Runs a GCD advanced search restricted to start dates between ComicYear
    and the current year (bumped to next year when run in December, since
    GCD may already carry next year's cover dates), then tries to match each
    result against the requested series by:

      * punctuation-stripped, case-folded name (also accepting the result
        with its first 3 characters dropped, e.g. "The Flash" vs "Flash"),
      * start year, with a 1-year grace for Dec/Jan publication overlap,
      * issue count, allowing a +/-1 discrepancy between GCD and ComicVine.

    If nothing matches, the search retries recursively with common title
    variations (spelled-out numbers -> digits, leading "The " dropped, ":"
    removed, "-" -> space, "and" -> "&").
    NOTE(review): the recursive retries drop 'quickmatch', so a quickmatch
    caller may get a details dict back from a retried search — confirm
    whether that is intended.

    Parameters:
        ComicName  -- series title
        ComicYear  -- series start year (string)
        Total      -- issue count according to ComicVine
        ComicID    -- ComicVine series id (passed through to GCDdetails)
        quickmatch -- when "yes", only report whether a match exists

    Returns:
        'Match' / 'No Match' when quickmatch == "yes"; 'No Match' when no
        candidate matched; otherwise the result of GCDdetails().
    """
    NOWyr = datetime.date.today().year
    if datetime.date.today().month == 12:
        # December cover dates can already belong to next year on GCD.
        NOWyr = NOWyr + 1
        logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
    comicnm = ComicName.encode('utf-8').strip()
    comicyr = ComicYear
    # URL-encode the series name: escape literal '+' first, then turn
    # spaces into '+' so the two substitutions don't clobber each other.
    comicnm_1 = re.sub('\+', '%2B', comicnm)
    comicnm = re.sub(' ', '+', comicnm_1)
    # (renamed from 'input' so the builtin of the same name isn't shadowed)
    searchurl = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'
    response = urllib2.urlopen(searchurl)
    soup = BeautifulSoup(response)

    # Result rows alternate between the two listing classes; fetch each
    # list once instead of re-querying the soup on every iteration.
    even_rows = soup.findAll("tr", {"class": "listing_even"})
    odd_rows = soup.findAll("tr", {"class": "listing_odd"})
    cnt = len(even_rows) + len(odd_rows)

    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while (n < cnt):
        # Interleave: even n -> "listing_even" rows, odd n -> "listing_odd".
        if n % 2 == 0:
            n_even += 1
            resultp = even_rows[n_even]
        else:
            n_odd += 1
            resultp = odd_rows[n_odd]
        # The second anchor in a row is the series link (first is publisher).
        rtp = resultp('a')[1]
        resultName.append(helpers.cleanName(rtp.findNext(text=True)))
        fip = resultp('a', href=True)[1]
        resultID.append(fip['href'])

        # Column 3 holds the start year, column 4 the "<N> issues" text.
        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ', '')
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # Keep only the numeric part preceding the word 'issue'.
        # (the original's .replace('', '') was a no-op and has been dropped)
        resiss = int(resultIssues[n].find('issue'))
        resultIssues[n] = resultIssues[n][:resiss]
        resultIssues[n] = resultIssues[n].replace(' ', '')
        # Compare names with punctuation and spaces stripped, case-folded.
        CleanComicName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', comicnm)
        CleanComicName = re.sub(' ', '', CleanComicName).lower()
        CleanResultName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', resultName[n])
        CleanResultName = re.sub(' ', '', CleanResultName).lower()
        if CleanResultName == CleanComicName or CleanResultName[3:] == CleanComicName:
            # Series years may differ by one when publication dates straddle
            # Dec/Jan, so accept ComicYear or ComicYear + 1.
            if resultYear[n] == ComicYear or resultYear[n] == str(int(ComicYear) + 1):
                # GCD and CV issue counts occasionally disagree by one
                # (usually CV lagging GCD); accept a +/-1 variation and
                # record which side is ahead so the caller can compensate.
                if int(resultIssues[n]) == int(Total) or int(resultIssues[n]) == int(Total) + 1 or (int(resultIssues[n]) + 1) == int(Total):
                    if int(resultIssues[n]) == int(Total) + 1:
                        issvariation = "cv"
                    elif int(resultIssues[n]) + 1 == int(Total):
                        issvariation = "gcd"
                    else:
                        issvariation = "no"
                    TotalIssues = resultIssues[n]
                    resultURL = str(resultID[n])
                    # Column 6 carries the "published" date range.
                    rptxt = resultp('td')[6]
                    resultPublished = rptxt.findNext(text=True)
                    break

        n += 1

    # No direct hit: account for spelling/punctuation variations between
    # ComicVine and GCD titles by retrying with a mutated name.
    basnumbs = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12}
    if resultURL is None:
        # Spelled-out numbers ("four") -> digits ("4").
        for numbs in basnumbs:
            if numbs in ComicName.lower():
                numconv = basnumbs[numbs]
                ComicNm = re.sub(str(numbs), str(numconv), ComicName.lower())
                # (an unreachable 'break' that followed this return was removed)
                return GCDScraper(ComicNm, ComicYear, Total, ComicID)
        if ComicName.lower().startswith('the '):
            ComicName = ComicName[4:]
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if ':' in ComicName:
            ComicName = re.sub(':', '', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if '-' in ComicName:
            ComicName = re.sub('-', ' ', ComicName)
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if 'and' in ComicName.lower():
            ComicName = ComicName.replace('and', '&')
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        if not quickmatch: return 'No Match'
    if quickmatch == "yes":
        if resultURL is None: return 'No Match'
        else: return 'Match'
    return GCDdetails(comseries=None, resultURL=resultURL, vari_loop=0, ComicID=ComicID, TotalIssues=TotalIssues, issvariation=issvariation, resultPublished=resultPublished)
Example #7
0
def GCDScraper(ComicName, ComicYear, Total, ComicID):
    """Match a series on comics.org (GCD) and scrape its issue list.

    Searches the GCD series-by-name index and accepts a candidate only when
    name, start year, and issue count (allowing GCD to be ahead by exactly
    one issue vs. ComicVine's Total) all line up; when nothing matches and
    the title starts with 'The ', retries once without that prefix.  On a
    match, fetches the series' details page and builds a per-issue list of
    issue numbers and publication/on-sale dates, skipping alternate-cover
    records so each issue appears once.

    Returns 'No Match' on failure, otherwise a dict with:
        'gcdchoice'    -- list of {'GCDid', 'GCDIssue', 'GCDDate'} entries
        'gcdvariation' -- "yes" if GCD reported Total+1 issues, else "no"
    """
    comicnm = ComicName
    comicyr = ComicYear
    comicis = Total
    comicid = ComicID
    # Spaces -> %20 for the series-by-name URL.
    comicnm = re.sub(' ', '%20', comicnm)
    # NOTE(review): 'input' shadows the builtin of the same name.
    input = 'http://www.comics.org/series/name/' + str(comicnm) + '/sort/alpha/'
    response = urllib2.urlopen ( input )
    soup = BeautifulSoup ( response)

    # Search-result rows alternate between the two listing classes.
    cnt1 = len(soup.findAll("tr", {"class" : "listing_even"}))
    cnt2 = len(soup.findAll("tr", {"class" : "listing_odd"}))

    cnt = int(cnt1 + cnt2)

    # Module-level global; presumably read elsewhere after this returns —
    # TODO confirm against callers.
    global resultPublished

    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = None
    n_odd = -1
    n_even = -1
    n = 0
    while ( n < cnt ):
        # Even n walks the "listing_even" rows, odd n the "listing_odd" rows.
        if n%2==0:
            n_even+=1
            resultp = soup.findAll("tr", {"class" : "listing_even"})[n_even]
        else:
            n_odd+=1
            resultp = soup.findAll("tr", {"class" : "listing_odd"})[n_odd]
        # Second anchor in the row is the series link (first is publisher).
        rtp = resultp('a')[1]
        resultName.append(helpers.cleanName(rtp.findNext(text=True)))
        fip = resultp('a',href=True)[1]
        resultID.append(fip['href'])
        print ( "ID: " + str(resultID[n]) )

        # Column 3: start year; column 4: "<N> issues".
        subtxt3 = resultp('td')[3]
        resultYear.append(subtxt3.findNext(text=True))
        resultYear[n] = resultYear[n].replace(' ','')
        subtxt4 = resultp('td')[4]
        resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
        # Keep only the text before the word 'issue' (the count itself).
        # NOTE(review): .replace('','') below is a no-op.
        resiss = resultIssues[n].find('issue')
        resiss = int(resiss)
        resultIssues[n] = resultIssues[n].replace('','')[:resiss]
        resultIssues[n] = resultIssues[n].replace(' ','')

        # Exact (case-insensitive) name match, then exact year match.
        if resultName[n].lower() == str(ComicName).lower():
            if resultYear[n] == ComicYear:
                #Occasionally there are discrepancies in comic count between
                #GCD and CV. 99% it's CV not updating to the newest issue as fast
                #as GCD does. Therefore, let's increase the CV count by 1 to get it
                #to match, any more variation could cause incorrect matching.
                if resultIssues[n] == Total or resultIssues[n] == str(int(Total)+1):
                    if resultIssues[n] == str(int(Total)+1):
                        issvariation = "yes"
                    else:
                        issvariation = "no"
                    resultURL = str(resultID[n])
                    # Column 6 carries the "published" date range.
                    rptxt = resultp('td')[6]
                    resultPublished = rptxt.findNext(text=True)
                    break

        n+=1
    # it's possible that comicvine would return a comic name incorrectly, or gcd
    # has the wrong title and won't match 100%...
    # (ie. The Flash-2011 on comicvine is Flash-2011 on gcd)
    if resultURL is None:
        print ("comicnm:" + str(ComicName))
        if ComicName.startswith('The '):
            # Retry once with the leading "The " dropped.
            ComicName = ComicName[4:]
            return GCDScraper(ComicName, ComicYear, Total, ComicID)
        else:
            return 'No Match'
    gcdinfo = {}
    gcdchoice = []

    # Fetch the matched series' details page to enumerate its issues.
    input2 = 'http://www.comics.org' + str(resultURL) + 'details/'
    resp = urllib2.urlopen ( input2 )
    soup = BeautifulSoup ( resp )

    #for newer comics, on-sale date has complete date...
    #for older comics, pub.date is to be used
    # NOTE(review): 'type' shadows the builtin, and 'datetype' is computed
    # but never used below — confirm whether the date-column choice was
    # meant to depend on it.
    type = soup.find(text=' On-sale date ')
    if type:
        datetype = "on-sale"
    else:
        datetype = "pub"

    # Issue rows on the details page alternate between these two classes.
    cnt1 = len(soup.findAll("tr", {"class" : "row_even_False"}))
    cnt2 = len(soup.findAll("tr", {"class" : "row_even_True"}))

    cnt = int(cnt1 + cnt2)

    n_odd = -1
    n_even = -1
    n = 0
    # PI remembers the previously accepted issue number so alternate covers
    # of the same issue can be skipped; altcount == 1 marks the current row
    # as the first record for its issue.
    PI = "1.00"
    altcount = 0
    while ( n < cnt ):
        # NOTE(review): n_odd indexes the row_even_False rows here (the
        # naming is inverted relative to the first loop) — harmless but
        # confusing.
        if n%2==0:
            n_odd+=1
            parsed = soup.findAll("tr", {"class" : "row_even_False"})[n_odd]
            ntype = "odd"
        else:
            n_even+=1
            ntype = "even"
            parsed = soup.findAll("tr", {"class" : "row_even_True"})[n_even]
        subtxt3 = parsed.find("a")
        ParseIssue = subtxt3.findNext(text=True)
        isslen = ParseIssue.find(' ')
        #if 'isslen' exists, it means that it's an alternative cover.
        #however, if ONLY alternate covers exist of an issue it won't work.
        #let's use the FIRST record, and ignore all other covers for the given issue.
        if isslen > 0:
            isschk = ParseIssue[:isslen]
            isschk2 = str(isschk) + ".00"
            ParseIssue = str(isschk2)
            if str(PI) == str(isschk2):
                if altcount == 0:
                    # First occurrence of this issue number: keep it.
                    ParseIssue = str(isschk2)
                    PI = str(isschk2)
                    altcount = 1
                else:
                    # Further alternate covers of the same issue: ignore.
                    ParseIssue = "this is wrong"
                    altcount+=1
            else:
                altcount = 1
                ParseIssue = str(isschk) + ".00"
        else:
            # No space in the issue text -> plain issue; decimalize it.
            ParseIssue = ParseIssue + ".00"
            altcount = 1
        if (altcount == 1):
            # in order to get the compare right, let's decimialize the string to '.00'.
            gcdinfo['ComicIssue'] = ParseIssue
            subtxt1 = parsed('td')[2]
            ParseDate = subtxt1.findNext(text=True)
            pdlen = len(ParseDate)
            # Dates shorter than 7 chars look partial; fall back to the
            # first <td>, and finally to a zero date when it's blank.
            if len(ParseDate) < 7:
                subtxt1 = parsed.find("td")
                ParseDate = subtxt1.findNext(text=True)
                if ParseDate == ' ':
                    ParseDate = "0000-00-00"
            ParseDate = ParseDate.replace(' ','')
            gcdinfo['ComicDate'] = ParseDate

            gcdchoice.append({
                'GCDid':                ComicID,
                'GCDIssue':             gcdinfo['ComicIssue'],
                'GCDDate':              gcdinfo['ComicDate']
                })

            gcdinfo['gcdchoice'] = gcdchoice
            PI = ParseIssue
        #else:
            # -- this needs a rework --
            # if issue only has alternative covers on comics.org, it won't match
            # and will cause the script to return a cannot retrieve..
            #compare previous issue to current issue (to help with alt.cover count)
         #   PI = ParseIssue
         #   altcount+=1
         #   print ("alternate issue - ignoring")
        #altcount = 0
        n+=1
    gcdinfo['gcdvariation'] = issvariation
    return gcdinfo