def split_files(search_key):
    """
    Loops through files, splitting Rashi and ktav yad rashi into 2 different files.
    Recommend running check_demarcation first.
    :param search_key: key to find end of Rashi and beginning of ktav yad rashi
    """
    # loop through files
    for page in range(functions.get_page(72, 'b'), functions.get_page(94, 'a') + 1):
        file_name = u'מנחות_{}.txt'.format(functions.get_daf(page))
        rashi = codecs.open(u'rashi_fixed/{}'.format(file_name), 'w', 'utf-8')
        ktav_yad_rashi = codecs.open(u'ktav_yad_rashi/{}'.format(file_name), 'w', 'utf-8')
        original = codecs.open(file_name, 'r', 'utf-8')
        found = False
        for line in original:
            if line.find(search_key) != -1:
                found = True
            if not found:
                rashi.write(line)
            else:
                ktav_yad_rashi.write(line)
        original.close()
        rashi.close()
        ktav_yad_rashi.close()
def check_demarcation(search_key):
    """
    Sanity check function: make sure a certain search key can be used to find the
    beginning of the ktav yad rashi in the text. Prints out files missing the
    search key, as well as the number of files searched and the number of keys found.
    :param search_key: A string indicating where ktav yad rashi begins.
    """
    total, count = 0, 0
    # loop through files
    for page in range(functions.get_page(72, 'b'), functions.get_page(94, 'a') + 1):
        file_name = u'מנחות_{}.txt'.format(functions.get_daf(page))
        rashi_file = codecs.open(file_name, 'r', 'utf-8')
        total += 1
        found_key = False
        for line in rashi_file:
            if line.find(search_key) != -1:
                found_key = True
                count += 1
                break
        if not found_key:
            print(file_name)
        rashi_file.close()
    print('{} files scanned, found key in {} files'.format(total, count))
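# A minimal usage sketch for the two helpers above, assuming the script runs in
# the directory holding the מנחות_*.txt files and that `functions` and `codecs`
# are already imported. The demarcation string itself is hypothetical — use
# whatever marker actually separates the two commentaries in your files.
if __name__ == '__main__':
    demarcation = u'<marker where ktav yad rashi begins>'  # hypothetical key
    check_demarcation(demarcation)  # verify every file contains the key
    split_files(demarcation)        # then perform the actual split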
def get_plist(self, url, st):
    # Create playlist
    content = func.get_page(url).text
    # New bs object
    soup = BeautifulSoup(content, 'lxml')
    plist = soup.find(name='ul', attrs={'class': 'f-hide'})
    # Pass the csv file name according to the toggle
    if not st.toggle:
        st.csv_fname = st.playlist_title = soup.find(name='h2', class_='f-ff2').string
    # Filter data
    for song in plist.find_all(name='li'):
        # id
        match = re.search('=([0-9]+)', song.a['href'])
        song_id = match.group(1)
        # Avoid repetitive recording of song names
        if song_id not in self.songs['id']:
            self.songs['id'].append(song_id)
            # name
            song_name = song.a.string
            self.songs['name'].append(song_name)
            # url
            song_url = 'https://music.163.com' + song.a['href']
            self.songs['url'].append(song_url)
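# The music snippets in this file call func.get_page(url), which evidently
# returns a requests.Response (.text and .json() are called on it). A minimal
# sketch under that assumption — the real helper may add retries or cookies:
import requests

def get_page(url, params=None):
    # Assumption: a browser User-Agent is spoofed, since music.163.com tends
    # to reject the default python-requests UA.
    headers = {'User-Agent': 'Mozilla/5.0'}
    return requests.get(url, params=params, headers=headers)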
def extract_degree_info():
    '''
    Extract and store information about individual degrees by visiting degree URLs.
    '''
    degrees = []
    for url in degree_urls:
        soup = get_page(url)
        if not soup:
            continue
        degree = {'url': url, 'source': 'Edison Project University Programs List'}
        # Degree name
        a = soup.header.h1.find('a', string=True)
        if a:
            degree['degree'] = a.string
        # Any "Label:" string is treated as a field name, with the next string
        # taken as its value.
        try:
            strings = [tag for tag in soup.main.stripped_strings]
            for i, string in enumerate(strings):
                if string.split()[0].endswith(':'):
                    degree[string.strip(':').lower()] = strings[i + 1]
        except AttributeError:
            pass
        degrees.append(degree)
        to_json(degree)
    return degrees
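# The extractors call a to_json(degree) helper that isn't shown here. A minimal
# sketch, assuming it appends one record per line (JSON Lines) so partial runs
# are preserved; the output path is hypothetical:
import json

def to_json(record, path='degrees.jsonl'):
    # Append one JSON object per line.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')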
def get_lyric(self):
    """Get the lyrics."""
    self.songs['lyric'] = []
    total = len(self.songs['id'])
    for song_id in self.songs['id']:
        url = 'http://music.163.com/api/song/lyric?os=pc&id=' \
              + song_id \
              + '&lv=-1&kv=-1&tv=-1'
        # Get the lyric content
        content = func.get_page(url).json()
        if 'lrc' in content and 'nolyric' not in content and content[
                'lrc'] is not None:
            lyric = content['lrc']['lyric']
            # Clean the lyrics: strip the [mm:ss.xx] timestamps
            lyric = re.sub(r'\[.*?\]', '', lyric)
            self.songs['lyric'].append(lyric)
            self.only_lyric.append(lyric)
            print('completed ' + str(
                round(self.songs['id'].index(song_id) / total * 100, 2)) + '% ', end='')
            print('added lyric id: ' + song_id)
        else:
            # Fill with a placeholder to avoid null values
            self.songs['lyric'].append('ThisShallBeIgnored')
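# A standalone illustration of the timestamp cleaning used above; the sample
# LRC line is made up for demonstration:
import re

sample = '[00:12.34]some lyric line'
print(re.sub(r'\[.*?\]', '', sample))  # -> 'some lyric line'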
def extract_degree_info():
    degrees = []
    url = 'https://analytics.ncsu.edu/?page_id=4184'
    soup = get_page(url)
    if not soup:
        return
    degree_header = soup.find(
        string="CHRONOLOGY OF GRADUATE PROGRAMS IN ANALYTICS AND DATA SCIENCE")
    p_tags = degree_header.find_all_next('p')
    for tag in p_tags:
        degree = {'url': url, 'source': 'Analytics NCSU'}
        # Degree title
        a = tag.find('a', string=True)
        if a:
            degree['degree'] = a.string
        # Degree URL
        a = tag.find('a', href=True)
        if a:
            degree['url'] = a.get('href')
        # First year of enrollment
        try:
            h3 = tag.find_previous('h3')
            degree['first_enrolled'] = int(
                h3.string.replace('\u2022', '').strip())
        except (ValueError, AttributeError):
            degree['first_enrolled'] = None
        # University and department are found in a single string separated by commas.
        try:
            # Remove the link tag to access the university and department string
            # in the p tag.
            tag.a.decompose()
            text = tag.string.strip().split(',')
            text = [value.strip() for value in text if value != '']
        except AttributeError:
            degree['university'] = None
            degree['department'] = None
        else:
            # Values can be assigned with certainty when the split yields exactly
            # two parts (a single comma).
            if len(text) == 2:
                degree['university'] = text[0]
                degree['department'] = text[1]
            # Otherwise (like when a university name includes a comma), both fields
            # are assigned the entire string and can be cleaned up later.
            else:
                degree['university'] = ','.join(text)
                degree['department'] = ','.join(text)
        degrees.append(degree)
    return degrees
def extract_degree_links():
    '''
    Find URLs that lead to degree pages.
    '''
    degree_urls = []
    for url in source_urls:
        soup = get_page(url)
        if soup:
            for td in soup.find_all('td', class_='views-field views-field-title'):
                href = td.find('a', href=True).get('href')
                degree_urls.append('http://edison-project.eu' + href)
    return degree_urls
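# How the two Edison helpers likely fit together (a sketch: source_urls and
# degree_urls are module-level names the functions above rely on; the listing
# URL shown here is a hypothetical placeholder):
source_urls = ['http://edison-project.eu/<programs-listing-page>']
degree_urls = extract_degree_links()
degrees = extract_degree_info()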
def extract_degree_info():
    degrees = []
    url = 'https://www.datasciencegraduateprograms.com/school-listing/#context/api/listings/prefilter'
    soup = get_page(url)
    if not soup:
        return
    for tag in soup.find('div', class_='stateheader-departments').find_all_next('a', href=True):
        degree = {'url': url, 'source': 'Data Science Graduate Programs'}
        # Degree title
        parent = tag.parent
        if parent:
            degree['degree'] = parent.string
        # Degree university
        h3 = tag.find_previous('h3', string=True)
        if h3:
            degree['university'] = h3.string
        # Degree department
        strong = tag.find_previous('strong', string=True)
        if strong:
            degree['department'] = strong.string
        # Degree URL
        degree['url'] = tag.get('href')
        # Degree state
        h2 = tag.find_previous('h2', string=True)
        if h2:
            degree['state'] = h2.string
        # Misc. properties of the degree
        ul = tag.find_next('ul')
        if ul:
            degree['properties'] = [li.string for li in ul.find_all('li')]
        # Degree accreditation info
        em = tag.find_next('em', string=True)
        if em:
            degree['accredidation'] = em.string
        degrees.append(degree)
        to_json(degree)
    return degrees
def get_playlists(self, st):
    # Serve cached search results when available; otherwise query the search
    # API and cache the JSON response under res/ (the directory must exist).
    try:
        with open('res/' + st.search_keyword + '.json', encoding='UTF-8') as f:
            p_json = json.load(f)
    except FileNotFoundError:
        url = 'http://music.163.com/api/search/get/web?csrf_token=hlpretag=&hlposttag=&s={' \
              + st.search_keyword + '}&type=1000&offset=0&total=true&limit=' + str(st.result_limit)
        p_json = func.get_page(url).json()
        with open('res/' + st.search_keyword + '.json', 'w', encoding='UTF-8') as k:
            text = json.dumps(p_json, ensure_ascii=False)
            k.write(text)
    result = p_json['result']
    self.playlists = result['playlists']
def update_fund_code_info(self):
    # Decode with utf-8-sig: a plain .decode() leaves a UTF-8 BOM, and
    # json.loads then fails with "Unexpected UTF-8 BOM (decode using utf-8-sig)".
    content = functions.get_page(
        "http://fund.eastmoney.com/js/fundcode_search.js").content.decode('utf-8-sig')
    # Strip the JavaScript wrapper ("var r = ...;") to leave bare JSON.
    content = content.replace("var r = ", "", 1)
    content = content.replace(";", "", 1)
    content = json.loads(content)
    df = pd.DataFrame(content, columns=["CODE", "WORD", "NAME", "TYPE", "PINYIN"])
    df.set_index("CODE", inplace=True)
    df = df.drop('PINYIN', axis=1)
    df.to_csv("resource/fundcode_info/fundcode_search.csv", encoding="UTF-8")
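# Quick sanity check of the CSV written above (path taken from the function;
# reading CODE as str preserves the leading zeros in fund codes):
import pandas as pd

df = pd.read_csv("resource/fundcode_info/fundcode_search.csv",
                 dtype={"CODE": str}).set_index("CODE")
print(df.head())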
def get_page_data(self, data, url, params):
    # Fetch the data table from the page
    content = functions.get_page(url, params=params).content
    soup = BeautifulSoup(content, "lxml")
    tbody = soup.find(name="tbody").find_all(name="tr")
    for tdays in tbody:
        tds = tdays.find_all(name="td")
        if tds[0].text == "暂无数据!":  # "No data available!"
            continue
        date = tds[0].text
        data[date] = []
        if date not in self.nw_df["DATE"].tolist():
            self.nw_df = self.nw_df.append({"DATE": date}, ignore_index=True)
        if date not in self.aw_df["DATE"].tolist():
            self.aw_df = self.aw_df.append({"DATE": date}, ignore_index=True)
        if date not in self.wi_df["DATE"].tolist():
            self.wi_df = self.wi_df.append({"DATE": date}, ignore_index=True)
        nw_index = self.nw_df[self.nw_df["DATE"] == date].index.tolist()
        aw_index = self.aw_df[self.aw_df["DATE"] == date].index.tolist()
        wi_index = self.wi_df[self.wi_df["DATE"] == date].index.tolist()
        # Columns 1-3 feed the nw/aw/wi DataFrames respectively.
        for i in range(len(tds)):
            data[date].append(tds[i].text)
            flag = False
            if i == 1:
                self.nw_df.loc[nw_index, params["code"]] = tds[i].text
                flag = True
            if i == 2:
                self.aw_df.loc[aw_index, params["code"]] = tds[i].text
                flag = True
            if i == 3:
                self.wi_df.loc[wi_index, params["code"]] = tds[i].text
                flag = True
            if flag:
                self.progress += 0.3333333
    return data
def get_lyric(self):
    """Get the lyrics."""
    self.songs['lyric'] = []
    total = len(self.songs['id'])
    for song_id in self.songs['id']:
        url = 'http://music.163.com/api/song/lyric?os=pc&id=' \
              + song_id \
              + '&lv=-1&kv=-1&tv=-1'
        # Get lyrics content
        content = func.get_page(url).json()
        if 'lrc' in content and 'nolyric' not in content and content[
                'lrc'] is not None:
            lyric = content['lrc']['lyric']
            # Clean the lyrics: strip the [mm:ss.xx] timestamps
            lyric = re.sub(r'\[.*?\]', '', lyric)
            # Drop metadata lines (composer, arranger, etc.), which contain a
            # half-width or full-width colon
            templist = lyric.split('\n')
            lyric = ''
            for t in templist:
                if len(re.findall(':', t)) != 0 or len(re.findall('：', t)) != 0:
                    continue
                lyric = lyric + t + '\n'
            self.songs['lyric'].append(lyric)
            self.only_lyric.append(lyric)
            print('completed lyric ' + str(
                round(self.songs['id'].index(song_id) / total * 100, 2)) + '% ', end='')
            print('added lyric id: ' + song_id)
        else:
            # Fill with a placeholder to avoid null values
            self.songs['lyric'].append('ThisShallBeIgnored')
def get_plist(self, url, st):
    # Build the playlist
    content = func.get_page(url).text
    # New bs object
    soup = BeautifulSoup(content, 'lxml')
    plist = soup.find(name='ul', attrs={'class': 'f-hide'})
    # Pass the csv file name according to the toggle
    if not st.toggle:
        st.csv_fname = st.playlist_title = soup.find(name='h2', class_='f-ff2').string
    # Filter the data
    for song in plist.find_all(name='li'):
        # id
        match = re.search('=([0-9]+)', song.a['href'])
        song_id = match.group(1)
        # Avoid recording duplicate songs
        if song_id not in self.songs['id']:
            self.songs['id'].append(song_id)
            # name
            song_name = song.a.string
            self.songs['name'].append(song_name)
            # url
            song_url = 'https://music.163.com' + song.a['href']
            self.songs['url'].append(song_url)
def get_detail(self):
    self.songs['songer'] = []
    self.songs['fee'] = []
    self.songs['album'] = []
    self.songs['publishTime'] = []
    self.songs['company'] = []
    self.songs['popularity'] = []
    self.songs['duration'] = []
    self.songs['score'] = []
    total = len(self.songs['id'])
    for song_id in self.songs['id']:
        url = 'http://music.163.com/api/song/detail/?id=' + song_id \
              + '&ids=%5B' + song_id + '%5D'
        # Get song detail
        content = func.get_page(url).json()
        name = content['songs'][0]['artists'][0]['name']
        fee = content['songs'][0]['fee']
        album = content['songs'][0]['album']['name']
        publishTime = content['songs'][0]['album']['publishTime']
        company = content['songs'][0]['album']['company']
        popularity = content['songs'][0]['popularity']
        duration = content['songs'][0]['duration']
        score = content['songs'][0]['score']
        # Fall back to a fixed default whenever a field is missing or empty
        if name is not None and name != '':
            self.songs['songer'].append(name)
        else:
            self.songs['songer'].append('UnKown')
        if fee is not None:
            self.songs['fee'].append(fee)
        else:
            self.songs['fee'].append(0)
        if album is not None and album != '':
            self.songs['album'].append(album)
        else:
            self.songs['album'].append('UnKown')
        if publishTime is not None:
            self.songs['publishTime'].append(publishTime)
        else:
            self.songs['publishTime'].append(1568304000000)
        if company is not None and company != '':
            self.songs['company'].append(company)
        else:
            self.songs['company'].append('UnKown')
        if popularity is not None:
            self.songs['popularity'].append(popularity)
        else:
            self.songs['popularity'].append(50)
        if duration is not None:
            self.songs['duration'].append(duration)
        else:
            self.songs['duration'].append(93000)
        if score is not None:
            self.songs['score'].append(score)
        else:
            self.songs['score'].append(50)
        print('completed detail ' + str(
            round(self.songs['id'].index(song_id) / total * 100, 2)) + '% ', end='')
        print('added detail id: ' + song_id)
        # Randomized delay to avoid hammering the API
        time.sleep(random.uniform(1, 2))
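# A plausible call order for the scraper methods above (a sketch only: the
# Crawler class name, the settings object st, and the playlist URL pattern are
# assumptions, not taken from the source):
crawler = Crawler()
crawler.get_playlists(st)
for pl in crawler.playlists:
    crawler.get_plist('https://music.163.com/playlist?id=' + str(pl['id']), st)
crawler.get_detail()
crawler.get_lyric()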
def list_housing(n):
    '''n is the page number to display.'''
    rows = get_page(n)
    recommended = recommend()
    return render_template('list.html', rows=rows, recommended=recommended)
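# The Flask view above presumably sits behind a route registration like this
# (a sketch: the app object and the URL rule are assumptions):
from flask import Flask, render_template

app = Flask(__name__)

@app.route('/list/<int:n>')
def list_housing_route(n):
    return list_housing(n)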