Example no. 1
def parse_searchpage(page, artist=None, album=None, id_field=ALBUM_ID):
    """Parses a search page and gets relevant info.


    Arguments:
    page -- html string with search page's html.
    artist -- artist to to check for in results. If found only results
              with that artist are returned.
    album -- album to check for in results. If found only results with
             with the album are returned.
    id_field -- key to use for the album id found.

    Return a tuple with the first element being == True if the list
    was truncated with only matching artist/albums.
    
    """
    page = get_encoding(page, True, 'utf8')[1]
    soup = parse_html.SoupWrapper(parse_html.parse(page))
    result_table = soup.find('ul', {'class': 'search-results'})
    try:
        results = result_table.find_all('div', {'class': 'info'})
    except AttributeError:
        return False, []

    albums = [parse_search_element(result) for result in results]

    if artist and album:
        d = {'artist': artist, 'album': album}
        top = [a for a in albums if equal(d, a, True)]
    elif album:
        d = {'album': album}
        top = [a for a in albums if equal(d, a, True, ['album'])]
        if not top:
            top = [a for a in albums if equal(d, a, False, ['album'])]
    elif artist:
        d = {'artist': artist}
        top = [a for a in albums if equal(d, a, True, ['artist'])]
        if not top:
            top = [a for a in albums if equal(d, a, False, ['artist'])]
    else:
        top = []

    if top:
        return True, top
    return False, albums
Example no. 2
def parse_searchpage(page, artist=None, album=None, id_field=ALBUM_ID):
    """Parses a search page and gets relevant info.


    Arguments:
    page -- html string with search page's html.
    artist -- artist to to check for in results. If found only results
              with that artist are returned.
    album -- album to check for in results. If found only results with
             with the album are returned.
    id_field -- key to use for the album id found.

    Return a tuple with the first element being == True if the list
    was truncated with only matching artist/albums.
    
    """
    page = get_encoding(page, True, 'utf8')[1]
    soup = parse_html.SoupWrapper(parse_html.parse(page))
    result_table = soup.find('ul', {'class': 'search-results'})
    try:
        results = result_table.find_all('div', {'class': 'info'})
    except AttributeError:
        return False, []

    albums = [parse_search_element(result) for result in results]

    if artist and album:
        d = {'artist': artist, 'album': album}
        top = [a for a in albums if equal(d, a, True)]
    elif album:
        d = {'album': album}
        top = [a for a in albums if equal(d, a, True, ['album'])]
        if not top:
            top = [a for a in albums if equal(d, a, False, ['album'])]
    elif artist:
        d = {'artist': artist}
        top = [a for a in albums if equal(d, a, True, ['artist'])]
        if not top:
            top = [a for a in albums if equal(d, a, False, ['artist'])]
    else:
        top = []

    if top:
        return True, top
    return False, albums
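Note: the `equal` helper and the `ALBUM_ID` constant used above are not shown in these examples. Judging from the call sites, `equal(d, result, matchcase, fields)` appears to compare selected fields of a result dict against `d`, optionally case-insensitively. A self-contained sketch of that assumed contract (not the module's actual implementation):

def equal(d, result, matchcase, fields=None):
    # Guessed contract: every key of `d` (or only those in `fields`,
    # when given) must match the corresponding value in `result`.
    # The real helper may well do fuzzier matching.
    keys = fields if fields else list(d)
    for key in keys:
        left = d.get(key, u'')
        right = result.get(key, u'')
        if not matchcase:
            left = left.lower()
            right = right.lower()
        if left != right:
            return False
    return True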
Example no. 3
def parse_albumpage(page, artist=None, album=None):
    """Parse an album page into an info dict and a track list."""
    info = {}

    album_soup = parse_html.SoupWrapper(parse_html.parse(page))

    # Note: the artist and album arguments are immediately overwritten
    # by the values scraped from the page itself.
    artist = album_soup.find('div', {'class': 'album-artist'})
    album = album_soup.find('div', {'class': 'album-title'})

    release_title = album_soup.find('h3', 'release-title')

    if release_title:
        album = release_title
        details = album_soup.find('p', {'class': 'release-details'})
        if details:
            info['release'] = convert(details.string)

    if not artist:
        artist = album_soup.find('h3', 'release-artist')

    if album is None:
        info.update({'artist': convert(artist.string), 'album': ''})
    else:
        info.update({
            'artist': convert(artist.string),
            'album': convert(album.string)
        })
    info['albumartist'] = info['artist']

    sidebar = album_soup.find('div', {'class': 'sidebar'})
    info.update(parse_sidebar(sidebar))
    info.update(convert_year(info))

    content = album_soup.find('section', {'class': 'review read-more'})
    if content:
        info.update(parse_review(content))

    #swipe = main.find('div', {'id':"similar-albums", 'class':"grid-gallery"})

    #info.update(parse_similar(swipe))

    info = dict(
        (spanmap.get(k, k), v) for k, v in info.iteritems() if not isempty(v))

    return [info, parse_tracks(album_soup, info)]
Example no. 4
def load_data(files):
    musics = []
    for f in files:
        music = parse_html.parse(f)
        chords = music.chords
        keys = music.keys
        dat = []
        for chord in chords:
            # Map the chord onto circle-of-fifths coordinates.
            new_chords = conv2cof.to_cof(chord, keys)
            x = []
            y = []
            for point in new_chords:
                x.append(point[0])
                y.append(point[1])
            if len(x) < 3:
                # Debug output for chords with fewer than three points.
                print x
            dat.append([x, y])
        music.dat = dat
        musics.append(music)
    return musics
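`conv2cof.to_cof` is a project module that is not shown; the surrounding code suggests it maps each chord onto circle-of-fifths coordinates, i.e. (x, y) points on a circle where neighbouring positions are a fifth apart. A self-contained sketch of that idea (an assumption about what the module computes):

import math

def pitch_class_to_cof(pitch_class):
    # Multiplying by 7 (a fifth is 7 semitones) reorders the 12 pitch
    # classes around the circle of fifths; then place them evenly on
    # the unit circle.
    position = (pitch_class * 7) % 12
    angle = 2 * math.pi * position / 12
    return (math.cos(angle), math.sin(angle))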
Example no. 5
def parse_albumpage(page, artist=None, album=None):
    """Parse an album page into an info dict and a track list."""
    info = {}

    album_soup = parse_html.SoupWrapper(parse_html.parse(page))

    # Note: the artist and album arguments are immediately overwritten
    # by the values scraped from the page itself.
    artist = album_soup.find('div', {'class': 'album-artist'})
    album = album_soup.find('div', {'class': 'album-title'})

    release_title = album_soup.find('h3', 'release-title')

    if release_title:
        album = release_title
        details = album_soup.find('p', {'class': 'release-details'})
        if details:
            info['release'] = convert(details.string)

    if not artist:
        artist = album_soup.find('h3', 'release-artist')

    if album is None:
        info.update({'artist': convert(artist.string), 'album': ''})
    else:
        info.update({
            'artist': convert(artist.string),
            'album': convert(album.string)
        })
    info['albumartist'] = info['artist']

    sidebar = album_soup.find('div', {'class': 'sidebar'})
    info.update(parse_sidebar(sidebar))
    info.update(convert_year(info))

    content = album_soup.find('section', {'class': 'review read-more'})
    if content:
        info.update(parse_review(content))

    #swipe = main.find('div', {'id':"similar-albums", 'class':"grid-gallery"})

    #info.update(parse_similar(swipe))

    info = dict(
        (spanmap.get(k, k), v) for k, v in info.iteritems() if not isempty(v))

    return [info, parse_tracks(album_soup, info)]
baseURL = "https://www.indeed.com/jobs?q=&l=Atlanta%2C+GA&filter={}start={}"
filterNumbers = list(range(0, 10))
startNumbers = list(range(0, 1000, 10))
minSleepTime = 1.00
randomRange = 1.50

if __name__ == "__main__":
    data = DataModel.load(model_name)

    compoundNumber = [(filterNumber, startNumber)
                      for filterNumber in filterNumbers
                      for startNumber in startNumbers]
    print("Request Count:", len(compoundNumber))

    for filterNumber, startNumber in tqdm(compoundNumber):
        url = baseURL.format(filterNumber, startNumber)
        print("Request URL: ", url)

        response = requests.get(url)

        items = parse(response.text)

        for item in items:
            data.add(item)

        randomSleep = minSleepTime + random.random() * randomRange
        print("Sleep for: ", randomSleep, "s")
        time.sleep(randomSleep)

    DataModel.save(data, model_name)
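The loop above issues each request exactly once, so one transient network error aborts the whole crawl. A minimal retry wrapper (a sketch, not part of the original script) using only `requests` and `time`:

def get_with_retries(url, attempts=3, backoff=2.0):
    # Retry on connection/HTTP errors with exponential backoff
    # (1s, 2s, 4s, ... for backoff=2.0).
    for attempt in range(attempts):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == attempts - 1:
                raise
            time.sleep(backoff ** attempt)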
Example no. 7
def data_process(url):
    """Build an HTML report of the best matches for each value parsed
    from url."""
    output_str = u"""
    <!DOCTYPE html>
    <html>
    <head>
    <style>
    table {
        width: 100%;
    }
    th, td {
        padding: 5px;
        text-align: left;
    }
    table tr:nth-child(even) {
        background-color: #eee;
    }
    table tr:nth-child(odd) {
        background-color: #fff;
    }
    table th {
        background-color: black;
        color: white;
    }
    </style>
    </head>
    <body>
    """
    
    for value in parse(url):
        result_dict = {}
        words = value.split()

        for word in words:
            if word in word_dict:
                for val in word_dict[word]:
                    if val in result_dict:
                        result_dict[val] += 1
                    else:
                        result_dict[val] = 1
        best_matches = sorted(result_dict.items(),
                              key=lambda x: x[1],
                              reverse=True)[:3]
        output_str += "The arabic input is: <br/>"
        output_str += value + "<br/>"
        output_str += "The best matches in decreasing order:<br/>"
        output_str += "<table style=\"width:100%\"><tr><th>Sura</th><th>Aya</th><th>Match(%)</th></tr>"
        for index,match in enumerate(best_matches):
            output_str += u"<tr><td>{}</td><td>{}</td>".format(dic_in[u'sura'][best_matches[index][0][0]][u'name'], best_matches[index][0][1]+1) 
            best = dic_in['sura'][best_matches[index][0][0]][u'aya'][best_matches[index][0][1]][u'text'].split()
            count = 0.0
            for input_word,compare_word in zip(words,best):
                if input_word == compare_word:
                    count += 1.0
                else:
                    count += zaro_calculate(input_word, compare_word)
            output_str += "<td>{}</td></tr>".format(count/len(words)*100)
        output_str += "</table>"
        
    output_str += "</body></html>"

    #print type(result_dict.items()[0])
    return output_str
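Note that `value` (scraped text) is interpolated into the generated HTML verbatim; if it can contain `<`, `>` or `&`, it should be escaped first. In Python 2, which this code targets (`u''` literals, the commented-out `print` statements), the standard library offers `cgi.escape`:

import cgi

def safe(text):
    # Escape &, <, > (and quotes) so scraped text cannot break the markup.
    return cgi.escape(text, quote=True)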
Example no. 8
from parse_html import parse
from zaro_winkler import zaro_calculate
import json
import pickle

url = 'http://abuaminaelias.com/brotherhood-in-the-quran-and-sunnah/'

with open('output.pickle', 'rb') as data:
    word_dict = pickle.load(data)

with open('output.json','r') as json_data:
    dic_in = json.load(json_data)


matchCases = {}

for value in parse(url):
    #print "---------------------------->"
    result_dict = {}
    words = value.split()
    #print len(words)

    for word in words:
        if word in word_dict:
            for val in word_dict[word]:
                if val in result_dict:
                    result_dict[val] += 1
                else:
                    result_dict[val] = 1

    #matchCases[value] = sorted(result_dict.items(),key=lambda x:x[1],reverse=True)[:3]
    best_results = sorted(result_dict.items(),
                          key=lambda x: x[1],
                          reverse=True)[:3]
Example no. 9
def data_process(url):
    """Build an HTML report of the best matches for each value parsed
    from url."""
    output_str = u"""
    <!DOCTYPE html>
    <html>
    <head>
    <style>
    table {
        width: 100%;
    }
    th, td {
        padding: 5px;
        text-align: left;
    }
    table tr:nth-child(even) {
        background-color: #eee;
    }
    table tr:nth-child(odd) {
        background-color: #fff;
    }
    table th {
        background-color: black;
        color: white;
    }
    </style>
    </head>
    <body>
    """

    for value in parse(url):
        result_dict = {}
        words = value.split()

        for word in words:
            if word in word_dict:
                for val in word_dict[word]:
                    if val in result_dict:
                        result_dict[val] += 1
                    else:
                        result_dict[val] = 1
        best_matches = sorted(result_dict.items(),
                              key=lambda x: x[1],
                              reverse=True)[:3]
        output_str += "The arabic input is: <br/>"
        output_str += value + "<br/>"
        output_str += "The best matches in decreasing order:<br/>"
        output_str += "<table style=\"width:100%\"><tr><th>Sura</th><th>Aya</th><th>Match(%)</th></tr>"
        for index, match in enumerate(best_matches):
            output_str += u"<tr><td>{}</td><td>{}</td>".format(
                dic_in[u'sura'][best_matches[index][0][0]][u'name'],
                best_matches[index][0][1] + 1)
            best = dic_in['sura'][best_matches[index][0][0]][u'aya'][
                best_matches[index][0][1]][u'text'].split()
            count = 0.0
            for input_word, compare_word in zip(words, best):
                if input_word == compare_word:
                    count += 1.0
                else:
                    count += zaro_calculate(input_word, compare_word)
            output_str += "<td>{}</td></tr>".format(count / len(words) * 100)
        output_str += "</table>"

    output_str += "</body></html>"

    #print type(result_dict.items()[0])
    return output_str
Example no. 10
from parse_html import parse
from zaro_winkler import zaro_calculate
import json
import pickle

url = 'http://abuaminaelias.com/brotherhood-in-the-quran-and-sunnah/'

with open('output.pickle', 'rb') as data:
    word_dict = pickle.load(data)

with open('output.json', 'r') as json_data:
    dic_in = json.load(json_data)

matchCases = {}

for value in parse(url):
    #print "---------------------------->"
    result_dict = {}
    words = value.split()
    #print len(words)

    for word in words:
        if word in word_dict:
            for val in word_dict[word]:
                if val in result_dict:
                    result_dict[val] += 1
                else:
                    result_dict[val] = 1

    #matchCases[value] = sorted(result_dict.items(),key=lambda x:x[1],reverse=True)[:3]
    best_results = sorted(result_dict.items(),
                          key=lambda x: x[1],
                          reverse=True)[:3]
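`zaro_calculate` comes from the `zaro_winkler` module imported above (the name suggests Jaro-Winkler string similarity), which is not shown. As a rough stand-in when experimenting without that module, the standard library's `difflib` gives a comparable 0-1 similarity score:

import difflib

def word_similarity(a, b):
    # Not Jaro-Winkler, but a reasonable placeholder ratio in [0, 1].
    return difflib.SequenceMatcher(None, a, b).ratio()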