def on_closing():
    try:
        Scr.end(scraper)
    except Exception:
        pass
    try:
        Ins.end(inserter)
    except Exception:
        pass
    window.destroy()
def scrape():
    page = fb_input.get()
    if "https://www.facebook.com/" in page:
        place = Scr.scrape(page, scraper)
        change_logo()
        change_bg()
        nazwa_input.delete(0, END)
        nazwa_input.insert(INSERT, place.nazwa)
        opis_input.delete(1.0, END)
        opis_input.insert(INSERT, place.opis)
        miasto_input.delete(0, END)
        miasto_input.insert(INSERT, place.miasto)
        adres_input.delete(0, END)
        adres_input.insert(INSERT, place.adres)
        ig_input.delete(0, END)
        ig_input.insert(INSERT, place.ig)
        email_input.delete(0, END)
        email_input.insert(INSERT, place.email)
        tel_input.delete(0, END)
        tel_input.insert(INSERT, place.tel)
        web_input.delete(0, END)
        web_input.insert(INSERT, place.web)
    else:
        fb_input.delete(0, END)
        fb_input.insert(INSERT, "Nieprawidłowy adres.")
def main():
    conn = sqlite3.connect('music163-2.db')
    cur = conn.cursor()
    cur.execute("""CREATE TABLE IF NOT EXISTS songs (
        id INTEGER PRIMARY KEY,
        name TINYTEXT,
        artist TINYTEXT,
        album TINYTEXT,
        duration INTEGER,
        lyric TEXT(3000),
        comment MEDIUMTEXT,
        modified_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP)""")
    cur.close()
    listID = ['148858141', '148822545', '148821771', '148822543', '148813956', '148818810']
    baseURL = 'http://music.163.com/playlist?id='
    for ID in listID:
        with open('./logfile-2/{}.log'.format(ID), 'w') as f:
            sys.stdout = f
            sys.stderr = f
            url = baseURL + ID
            songs = Scrape.getSongDetail(url)
            dumpToDB(songs, conn)
            sleep(abs(random.gauss(6, 2.5)) % 15)
    conn.close()
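# dumpToDB is called above but not shown. A minimal sketch of what it might look
# like, assuming each entry in `songs` is a dict whose keys match the columns of
# the songs table created in main() (hypothetical helper, not from the source):
def dumpToDB(songs, conn):
    cur = conn.cursor()
    cur.executemany(
        """INSERT OR REPLACE INTO songs (id, name, artist, album, duration, lyric, comment)
           VALUES (:id, :name, :artist, :album, :duration, :lyric, :comment)""",
        songs)  # executemany accepts an iterable of mappings for named placeholders
    conn.commit()  # persist the batch before moving on to the next playlist
    cur.close()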
def result(link):
    # Main function for invoking all the respective scripts
    link = str(link)
    Scrape.starter(link)  # Call the starter function in the Scrape script, which runs the Scrape script completely
    global results  # Declare 'results' as global so that it does not shadow the already defined 'results' variable
    if checkDomain():
        if checkLinks():
            results.set("This site is not a fake news site")  # True: none of the hyperlinked URLs are present in the database
        else:
            results.set("This site has sources from a fake site")  # False: one or more hyperlinked URLs are present in the database
    else:
        results.set("This site is a fake news site")
def run(entry):
    Alchemy.AD_cls()
    brands = Alchemy.BL_sel()
    ad_data = Scrape.main(brands)
    msgbox(5, 0)
    # Alchemy.AD_ins(ad_data)
    return
def PostComments():
    for post in AllPosts:
        if post.score < 40:
            keyphrase = key_phrase_analysis.getKeyPhrases(post.message)
            for key in keyphrase:
                print('')
            graph.put_comment(object_id=post.id, message=Scrape.GetQuotes(post.category))
def checkLinks():
    # Check the domain of all the hyperlinks in the given URL
    fo = open('links.txt')
    for line in fo:
        # Call the getStrippedLink function in the Scrape script to get only the domain of the hyperlinked URL
        link = Scrape.getStrippedLink(line)
        if not Compare.check(link):
            return False
    return True
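# Scrape.getStrippedLink is assumed above to reduce a hyperlink to just its domain.
# A minimal stand-in sketch using urllib.parse (hypothetical, not the project's code):
from urllib.parse import urlparse

def getStrippedLink(line):
    # strip whitespace/newlines from the file line, then keep only the network location
    return urlparse(line.strip()).netloc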
def launchScrapeHandler(timeout):
    scrapeHandler = Scrape.ScrapeHandler(timeout, DATABASE_PATH + DATABASE_NAME, DATABASE_TABLE_NAME, SCRAPE_PATH)
    # launch the fetch handler as a thread
    scrapeHandler = threading.Thread(target=scrapeHandler.fetch, args=())
    scrapeHandler.start()
    return scrapeHandler
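# Hedged usage sketch (assumed, not from the source): launchScrapeHandler returns the
# started Thread, so a caller can wait for the fetch run to finish before shutting down.
# The timeout value of 30 is only an illustrative placeholder.
worker = launchScrapeHandler(timeout=30)
worker.join()  # block until ScrapeHandler.fetch completes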
def collect():
    print('\nCollecting @ ', datetime.now())  # current time
    start = t.time()
    articles_list = Scrape.get_articles()  # scraping data
    DBM.populate_db(articles_list)  # saving to database
    DBM.prune_db(days)  # pruning articles
    print(str(t.time() - start), 'seconds')  # timing the execution
    print('Next collection @ ~', datetime.now() + timedelta(minutes=min_interval), '\n')  # approx. next collection time
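# A minimal scheduling sketch (assumption, not from the source) showing how collect()
# could be run every min_interval minutes, matching the "Next collection" message above.
import time as t  # the snippet above already aliases time as t

while True:
    collect()
    t.sleep(min_interval * 60)  # min_interval is assumed to be defined in minutes elsewhere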
def scrape():
    data = Scrape.scrapping()  # run the scraping function
    return render_template("scrape.html", xyz=data)
# local data
# we will collect 100 articles from each site
local_urls = [
    'http://www.nbcwashington.com/news/local/?page=',
    'http://www.nbcnewyork.com/news/local/?page=',
    'http://www.nbcchicago.com/news/local/?page=',
    'http://www.nbcphiladelphia.com/news/local/?page='
]

# collect the first 100 links to articles, stored in a list, based on the above URLs
local_articles = []
i = 1
page = 1
scrapper = Scrape.NBCLocalScrape()
for url in local_urls:
    local_articles.extend(scrapper.get_article_urls(url, page))
    while len(local_articles) < i * 100:
        page += 1
        local_articles.extend(scrapper.get_article_urls(url, page))
    i += 1
# print(local_articles)

# output each article to a txt file under training, local-00i.txt
local_data = ""
for article in local_articles:
    # print(article)
    local_data += scrapper.get_article_text("http:" + str(article)) + " "
def addNews():
    Data = Scrape.scrape_gas()
    # collection.insert_one(Data)
    # Scrape_Data = Data.find_one()
    collection.replace_one({}, Data, upsert=True)
    return render_template('News.html', Gas_info=Data)
import Scrape

cnn = Scrape.CNNScrape()
print(cnn.get_article_text("http://www.cnn.com/2017/08/14/politics/trump-condemns-charlottesville-attackers/index.html"))
    pass

# ---------------------------------------------------- #
# Search the jobs
Search(driver, search_bar, result_filter)

# If no matching jobs are found
try:
    no_job_result = driver.find_element_by_xpath("//div[@class='jobs-search-no-results__image']")
    print('No Matching Jobs Found')
except:
    pass

Instance = Scrape.Scraping(driver)
url_list = Instance.get_all_urls(driver)

# url_list must not be empty
if url_list:
    for i in range(len(url_list)):
        if i >= 10:
            print('Only the first 10 pages of searched results are studied')
            url_list = url_list[0:10]
            break
    final_dict = {}
    for m in range(len(url_list)):
        original_job_links, shortened_job_links, job_ids = Instance.get_all_links(driver, url_list[m])
urls = [
    "http://127.0.0.1:5000/static/test.html",
    "http://127.0.0.1:5000/static/test.html",
    "http://127.0.0.1:5000/static/test.html"
]

for url in urls:
    @Scrape.async_request_scrape(url, Sources.AioHttpSource, Save.TextDataBaseSQLalchemySaver)
    def saveDataBase(
            source: str,
            proxies=None
    ) -> Save.TextDataBaseSQLalchemySaver.TextDataBaseSQLalchemySaverData:
        meta = MetaData()
        engine = create_engine('sqlite:///school_exp.db')
        id_col = Column('id', Integer)
        name_col = Column('name', String)
        school_name_col = Column('school_name', String)
        major_col = Column('major', String)
        cols_ = [id_col, name_col, school_name_col, major_col]
        rows = [(1, 'steve', 'SJSU', 'chemistry'),
                (2, "Susu", "MIT", "biology"),
                (3, "Belle", "UCLA", "Simpology")]
        data = Save.TextDataBaseSQLalchemySaver.TextDataBaseSQLalchemySaverData(
            engine, meta, cols_, rows, 'test')
        return data

Scrape.run()
    tel = tel_input.get()
    web = web_input.get()
    if fb and nazwa and miasto and adres:
        place = Place(nazwa, opis, miasto, adres, fb, ig, email, tel, web)
        Ins.insert(place, inserter)
    else:
        fb_input.delete(0, END)
        fb_input.insert(INSERT, "Brak informacji.")


window = Tk()
window.title("Facebook Gastronomy Scraper")
window.resizable(0, 0)
window.protocol("WM_DELETE_WINDOW", on_closing)

scraper = Scr.open_driver()
inserter = Ins.login()

top = Frame(window)
top.grid(row=0, column=0)
Label(top, text="Facebook: ").grid(row=0, column=0, padx=5, pady=15)
fb_input = Entry(top, width=40)
fb_input.grid(row=0, column=1, padx=5, pady=15)
Button(top, text="Pobierz", width=7, command=scrape).grid(
    row=0, column=2, padx=5, pady=15)

images = Frame(window)
images.grid(row=1, column=1)
import Scrape, Parse, UpdWordAt
import os

target_url = 'http://news.yahoo.co.jp'
target_id = 'editorsPick'

# Scraping results
title_scraped_list = []
# MeCab analysis results (one list entry per sentence)
title_parsed_list = []
# MeCab word, part of speech, and kana reading
word_attribute_list = [3 * []]

# Get the list of titles
# e.g. [不信任案めぐり与野党神経戦, 衛星ひとみ 人為的ミス重なる, ...]
title_scraped_list = Scrape.scrape(target_url, target_id)

for elem in title_scraped_list:
    # Break the sentence down by part of speech
    # Output as a list with line breaks
    title_parsed_list.append(Parse.execute(elem))

# Split each title into a list of word, part of speech, and kana
for title_parsed_sentence in title_parsed_list:
    # Build a list of individual words
    title_line = title_parsed_sentence.split(os.linesep)
    for title_sep_words in title_line: