def parse_searchpage(page, artist=None, album=None, id_field=ALBUM_ID):
    """Parse a search page and extract relevant album results.

    Arguments:
    page -- HTML string with the search page's markup.
    artist -- artist to check for in results. If found, only results
        with that artist are returned.
    album -- album to check for in results. If found, only results
        with that album are returned.
    id_field -- key to use for the album id found.

    Return a (truncated, albums) tuple: the first element is True when
    the list was truncated to only matching artist/albums.
    """
    # get_encoding returns (encoding, text); we only need the decoded text.
    page = get_encoding(page, True, 'utf8')[1]
    soup = parse_html.SoupWrapper(parse_html.parse(page))
    result_table = soup.find('ul', {'class': 'search-results'})
    try:
        results = result_table.find_all('div', {'class': 'info'})
    except AttributeError:
        # No results list on the page. Was `return []`, which broke the
        # documented tuple contract; keep it consistent.
        return False, []
    albums = [parse_search_element(result) for result in results]

    d = {}
    if artist and album:
        d = {'artist': artist, 'album': album}
        top = [a for a in albums if equal(d, a, True)]
    elif album:
        d = {'album': album}
        top = [a for a in albums if equal(d, a, True, ['album'])]
        if not top:
            # Fall back to a looser (non-exact) match.
            top = [a for a in albums if equal(d, a, False, ['album'])]
    elif artist:
        d = {'artist': artist}
        top = [a for a in albums if equal(d, a, True, ['artist'])]
        # Was `if not ret:` — NameError on an undefined name.
        if not top:
            top = [a for a in albums if equal(d, a, False, ['artist'])]
    else:
        top = []

    if top:
        # Truncated to the matching subset, per the docstring contract.
        # (The original discarded `top` and always returned all albums.)
        return True, top
    return False, albums
def parse_searchpage(page, artist=None, album=None, id_field=ALBUM_ID):
    """Parse a search page and extract relevant album results.

    Arguments:
    page -- HTML string with the search page's markup.
    artist -- artist to check for in results. If found, only results
        with that artist are returned.
    album -- album to check for in results. If found, only results
        with that album are returned.
    id_field -- key to use for the album id found.

    Return a (truncated, albums) tuple: the first element is True when
    the list was truncated to only matching artist/albums.
    """
    # get_encoding returns (encoding, text); we only need the decoded text.
    page = get_encoding(page, True, 'utf8')[1]
    soup = parse_html.SoupWrapper(parse_html.parse(page))
    result_table = soup.find('ul', {'class': 'search-results'})
    try:
        results = result_table.find_all('div', {'class': 'info'})
    except AttributeError:
        # No results list on the page. Was `return []`, which broke the
        # documented tuple contract; keep it consistent.
        return False, []
    albums = [parse_search_element(result) for result in results]

    d = {}
    if artist and album:
        d = {'artist': artist, 'album': album}
        top = [a for a in albums if equal(d, a, True)]
    elif album:
        d = {'album': album}
        top = [a for a in albums if equal(d, a, True, ['album'])]
        if not top:
            # Fall back to a looser (non-exact) match.
            top = [a for a in albums if equal(d, a, False, ['album'])]
    elif artist:
        d = {'artist': artist}
        top = [a for a in albums if equal(d, a, True, ['artist'])]
        # Was `if not ret:` — NameError on an undefined name.
        if not top:
            top = [a for a in albums if equal(d, a, False, ['artist'])]
    else:
        top = []

    if top:
        # Truncated to the matching subset, per the docstring contract.
        # (The original discarded `top` and always returned all albums.)
        return True, top
    return False, albums
def parse_albumpage(page, artist=None, album=None):
    """Parse an album page into an info dict plus a track listing.

    Arguments:
    page -- HTML string of the album page.
    artist, album -- accepted for interface compatibility, but NOTE they
        are immediately overwritten below; the page content is the only
        source actually used.

    Return [info, tracks] where info maps tag names to scraped values.
    """
    info = {}
    album_soup = parse_html.SoupWrapper(parse_html.parse(page))
    # NOTE(review): the artist/album parameters are clobbered here.
    artist = album_soup.find('div', {'class': 'album-artist'})
    album = album_soup.find('div', {'class': 'album-title'})
    release_title = album_soup.find('h3', 'release-title')
    if release_title:
        # Prefer the release-specific title when the page provides one.
        album = release_title
    details = album_soup.find('p', {'class': 'release-details'})
    if details:
        info['release'] = convert(details.string)
    if not artist:
        # Alternate page layout keeps the artist in an h3 heading.
        artist = album_soup.find('h3', 'release-artist')
    if album is None:
        info.update({'artist': convert(artist.string), 'album': ''})
    else:
        info.update({
            'artist': convert(artist.string),
            'album': convert(album.string)
        })
    info['albumartist'] = info['artist']
    sidebar = album_soup.find('div', {'class': 'sidebar'})
    info.update(parse_sidebar(sidebar))
    info.update(convert_year(info))
    content = album_soup.find('section', {'class': 'review read-more'})
    if content:
        info.update(parse_review(content))
    #swipe = main.find('div', {'id':"similar-albums", 'class':"grid-gallery"})
    #info.update(parse_similar(swipe))
    # Rename scraped keys via spanmap and drop empty values.
    # (Python 2: dict.iteritems.)
    info = dict(
        (spanmap.get(k, k), v) for k, v in info.iteritems() if not isempty(v))
    return [info, parse_tracks(album_soup, info)]
def load_data(files): musics = [] for idx, f in enumerate(files): music = parse_html.parse(f) chords = music.chords keys = music.keys dat = [] for chord in chords: new_chords = conv2cof.to_cof(chord, keys) x = [] y = [] for chord in new_chords: x.append(chord[0]) y.append(chord[1]) if len(x) < 3: print x dat.append([x, y]) music.dat = dat musics.append(music) return musics
def parse_albumpage(page, artist=None, album=None):
    """Parse an album page into an info dict plus a track listing.

    Arguments:
    page -- HTML string of the album page.
    artist, album -- accepted for interface compatibility, but NOTE they
        are immediately overwritten below; the page content is the only
        source actually used.

    Return [info, tracks] where info maps tag names to scraped values.
    """
    info = {}
    album_soup = parse_html.SoupWrapper(parse_html.parse(page))
    # NOTE(review): the artist/album parameters are clobbered here.
    artist = album_soup.find('div', {'class': 'album-artist'})
    album = album_soup.find('div', {'class': 'album-title'})
    release_title = album_soup.find('h3', 'release-title')
    if release_title:
        # Prefer the release-specific title when the page provides one.
        album = release_title
    details = album_soup.find('p', {'class': 'release-details'})
    if details:
        info['release'] = convert(details.string)
    if not artist:
        # Alternate page layout keeps the artist in an h3 heading.
        artist = album_soup.find('h3', 'release-artist')
    if album is None:
        info.update({'artist': convert(artist.string), 'album': ''})
    else:
        info.update({'artist': convert(artist.string),
                     'album': convert(album.string)})
    info['albumartist'] = info['artist']
    sidebar = album_soup.find('div', {'class': 'sidebar'})
    info.update(parse_sidebar(sidebar))
    info.update(convert_year(info))
    content = album_soup.find('section', {'class': 'review read-more'})
    if content:
        info.update(parse_review(content))
    #swipe = main.find('div', {'id':"similar-albums", 'class':"grid-gallery"})
    #info.update(parse_similar(swipe))
    # Rename scraped keys via spanmap and drop empty values.
    # (Python 2: dict.iteritems.)
    info = dict((spanmap.get(k,k),v) for k, v in info.iteritems()
                if not isempty(v))
    return [info, parse_tracks(album_soup, info)]
# Scrape Indeed's Atlanta job-search results across every combination of
# filter value and results offset, feeding each page into the DataModel.
baseURL = "https://www.indeed.com/jobs?q=&l=Atlanta%2C+GA&filter={}start={}"
filterNumbers = list(range(0, 10))
startNumbers = list(range(0, 1000, 10))
minSleepTime = 1.00  # seconds: base delay between requests
randomRange = 1.50   # seconds: extra random jitter added to the delay

if __name__ == "__main__":
    data = DataModel.load(model_name)
    # Cartesian product of every filter value with every results offset.
    compoundNumber = [(filterNumber, startNumber)
                      for filterNumber in filterNumbers
                      for startNumber in startNumbers]
    print("Request Count:", len(compoundNumber))
    for filterNumber, startNumber in tqdm(compoundNumber):
        url = baseURL.format(filterNumber, startNumber)
        print("Request URL: ", url)
        response = requests.get(url)
        if response.status_code != 200:
            # Best-effort: skip failed pages instead of parsing an error
            # body (the original parsed whatever came back).
            print("Skipping, HTTP status:", response.status_code)
        else:
            items = parse(response.text)
            for item in items:
                data.add(item)
        # Randomized delay so we don't hammer the server / trip rate limits.
        randomSleep = minSleepTime + random.random() * randomRange
        print("Sleep for: ", randomSleep, "s")
        time.sleep(randomSleep)
    DataModel.save(data, model_name)
def data_process(url):
    """Build an HTML report matching text blocks from *url* against ayas.

    For each text block returned by parse(url), count per-reference word
    hits via word_dict, keep the three best (sura, aya) candidates, and
    score each candidate word-by-word (exact match counts 1.0, otherwise
    zaro_calculate similarity).

    Return the complete HTML document as a unicode string.
    """
    output_str = u"""
<!DOCTYPE html>
<html>
<head>
<style>
table {
    width:100%;
}
th, td {
    padding: 5px;
    text-align: left;
}
table tr:nth-child(even) {
    background-color: #eee;
}
table tr:nth-child(odd) {
    background-color:#fff;
}
table th {
    background-color: black;
    color: white;
}
</style>
</head>
<body>
"""
    for value in parse(url):
        # Count, per (sura, aya) reference, how many input words hit it.
        result_dict = {}
        words = value.split()
        for word in words:
            if word in word_dict:
                for val in word_dict[word]:
                    if val in result_dict:
                        result_dict[val] += 1
                    else:
                        result_dict[val] = 1
        best_matches = sorted(
            result_dict.items(), key=lambda x: x[1], reverse=True)[:3]
        output_str += "The arabic input is: <br/>"
        output_str += value + "<br/>"
        output_str += "The best matches in decreasing order:<br/>"
        output_str += "<table style=\"width:100%\"><tr><th>Sura</th><th>Aya</th><th>Match(%)</th></tr>"
        # Was `for index, match in enumerate(best_matches)` with `match`
        # unused and every access spelled `best_matches[index][...]`.
        for ref, _count in best_matches:
            sura_idx, aya_idx = ref[0], ref[1]
            output_str += u"<tr><td>{}</td><td>{}</td>".format(
                dic_in[u'sura'][sura_idx][u'name'], aya_idx + 1)
            best = dic_in['sura'][sura_idx][u'aya'][aya_idx][u'text'].split()
            # Score: exact word matches count 1.0, otherwise similarity.
            # zip truncates to the shorter of the two word lists.
            count = 0.0
            for input_word, compare_word in zip(words, best):
                if input_word == compare_word:
                    count += 1.0
                else:
                    count += zaro_calculate(input_word, compare_word)
            # words is non-empty here: best_matches can only be non-empty
            # when at least one word produced a hit.
            output_str += "<td>{}</td></tr>".format(count / len(words) * 100)
        output_str += "</table>"
    output_str += "</body></html>"
    return output_str
from parse_html import parse
from zaro_winkler import zaro_calculate
import json
import pickle  # was missing: pickle.load below raised NameError

url = 'http://abuaminaelias.com/brotherhood-in-the-quran-and-sunnah/'

# Pickled word index — per its usage below, it maps each word to the
# references it appears in (TODO confirm against the builder script).
with open('output.pickle', 'rb') as data:
    word_dict = pickle.load(data)
# Structured text keyed as dic_in['sura'][i]['aya'][j] (see data_process).
with open('output.json', 'r') as json_data:
    dic_in = json.load(json_data)

matchCases = {}
for value in parse(url):
    #print "---------------------------->"
    result_dict = {}
    words = value.split()
    #print len(words)
    # Count, per reference, how many words of this block hit it.
    for word in words:
        if word in word_dict:
            for val in word_dict[word]:
                if val in result_dict:
                    result_dict[val] += 1
                else:
                    result_dict[val] = 1
    #matchCases[value] = sorted(result_dict.items(),key=lambda x:x[1],reverse=True)[:3]
    # Keep the three references with the most word hits.
    best_results = sorted(
        result_dict.items(), key=lambda x: x[1], reverse=True)[:3]
def data_process(url):
    """Build an HTML report matching text blocks from *url* against ayas.

    For each text block returned by parse(url), count per-reference word
    hits via word_dict, keep the three best (sura, aya) candidates, and
    score each candidate word-by-word (exact match counts 1.0, otherwise
    zaro_calculate similarity).

    Return the complete HTML document as a unicode string.
    """
    output_str = u"""
<!DOCTYPE html>
<html>
<head>
<style>
table {
    width:100%;
}
th, td {
    padding: 5px;
    text-align: left;
}
table tr:nth-child(even) {
    background-color: #eee;
}
table tr:nth-child(odd) {
    background-color:#fff;
}
table th {
    background-color: black;
    color: white;
}
</style>
</head>
<body>
"""
    for value in parse(url):
        # Count, per (sura, aya) reference, how many input words hit it.
        result_dict = {}
        words = value.split()
        for word in words:
            if word in word_dict:
                for val in word_dict[word]:
                    if val in result_dict:
                        result_dict[val] += 1
                    else:
                        result_dict[val] = 1
        best_matches = sorted(
            result_dict.items(), key=lambda x: x[1], reverse=True)[:3]
        output_str += "The arabic input is: <br/>"
        output_str += value + "<br/>"
        output_str += "The best matches in decreasing order:<br/>"
        output_str += "<table style=\"width:100%\"><tr><th>Sura</th><th>Aya</th><th>Match(%)</th></tr>"
        # Was `for index, match in enumerate(best_matches)` with `match`
        # unused and every access spelled `best_matches[index][...]`.
        for ref, _count in best_matches:
            sura_idx, aya_idx = ref[0], ref[1]
            output_str += u"<tr><td>{}</td><td>{}</td>".format(
                dic_in[u'sura'][sura_idx][u'name'], aya_idx + 1)
            best = dic_in['sura'][sura_idx][u'aya'][aya_idx][u'text'].split()
            # Score: exact word matches count 1.0, otherwise similarity.
            # zip truncates to the shorter of the two word lists.
            count = 0.0
            for input_word, compare_word in zip(words, best):
                if input_word == compare_word:
                    count += 1.0
                else:
                    count += zaro_calculate(input_word, compare_word)
            # words is non-empty here: best_matches can only be non-empty
            # when at least one word produced a hit.
            output_str += "<td>{}</td></tr>".format(count / len(words) * 100)
        output_str += "</table>"
    output_str += "</body></html>"
    return output_str
from parse_html import parse from zaro_winkler import zaro_calculate import json url = 'http://abuaminaelias.com/brotherhood-in-the-quran-and-sunnah/' with open('output.pickle', 'rb') as data: word_dict = pickle.load(data) with open('output.json', 'r') as json_data: dic_in = json.load(json_data) matchCases = {} for value in parse(url): #print "---------------------------->" result_dict = {} words = value.split() #print len(words) for word in words: if word in word_dict: for val in word_dict[word]: if val in result_dict: result_dict[val] += 1 else: result_dict[val] = 1 #matchCases[value] = sorted(result_dict.items(),key=lambda x:x[1],reverse=True)[:3] best_results = sorted(result_dict.items(),