Example #1
def on_closing():
    try:
        Scr.end(scraper)
    except Exception:
        pass
    try:
        Ins.end(inserter)
    except Exception:
        pass
    window.destroy()
Example #2
def scrape():
    page = fb_input.get()
    if "https://www.facebook.com/" in page:
        place = Scr.scrape(page, scraper)
        change_logo()
        change_bg()
        nazwa_input.delete(0, END)
        nazwa_input.insert(INSERT, place.nazwa)
        opis_input.delete(1.0, END)
        opis_input.insert(INSERT, place.opis)
        miasto_input.delete(0, END)
        miasto_input.insert(INSERT, place.miasto)
        adres_input.delete(0, END)
        adres_input.insert(INSERT, place.adres)
        ig_input.delete(0, END)
        ig_input.insert(INSERT, place.ig)
        email_input.delete(0, END)
        email_input.insert(INSERT, place.email)
        tel_input.delete(0, END)
        tel_input.insert(INSERT, place.tel)
        web_input.delete(0, END)
        web_input.insert(INSERT, place.web)
    else:
        fb_input.delete(0, END)
        fb_input.insert(INSERT, "Nieprawidłowy adres.")
Example #3
def main():
    conn = sqlite3.connect('music163-2.db')
    cur = conn.cursor()

    cur.execute("""CREATE TABLE IF NOT EXISTS songs (
        id INTEGER PRIMARY KEY, 
        name TINYTEXT,
        artist TINYTEXT,
        album TINYTEXT,
        duration INTEGER,
        lyric TEXT(3000),
        comment MEDIUMTEXT,
        modified_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP)""")
    cur.close()

    listID = ['148858141', '148822545', '148821771', '148822543', 
        '148813956', '148818810']

    baseURL = 'http://music.163.com/playlist?id='
    for ID in listID:
        # write this playlist's console output to its own log file
        with open('./logfile-2/{}.log'.format(ID), 'w') as f:
            sys.stdout = f
            sys.stderr = f
            url = baseURL + ID
            songs = Scrape.getSongDetail(url)
            dumpToDB(songs, conn)
            sleep(abs(random.gauss(6, 2.5)) % 15)
    conn.close()
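dumpToDB is not shown in Example #3. A minimal sketch of what it could do, assuming Scrape.getSongDetail returns an iterable of dicts whose keys match the columns of the songs table created above (the shape of the scraped records is an assumption):

def dumpToDB(songs, conn):
    # hypothetical helper: insert each scraped record into the songs table
    cur = conn.cursor()
    cur.executemany(
        """INSERT OR REPLACE INTO songs (id, name, artist, album, duration, lyric, comment)
           VALUES (:id, :name, :artist, :album, :duration, :lyric, :comment)""",
        songs)
    conn.commit()
    cur.close()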
Example #4
def result(link):
    # Main function for invoking all the respective scripts
    link = str(link)
    Scrape.starter(link)
    # Call the starter function in the Scrape script, which runs the Scrape script completely
    global results
    # Declare 'results' as global so that this function does not shadow the already defined 'results' variable
    if checkDomain():
        if checkLinks():
            results.set("This site is not a fake news site")
            # If the response is True, none of the hyperlinked URLs are present in the database
        else:
            results.set("This site has sources from a fake site")
            # If the response is False, one or more hyperlinked URLs are present in the database
    else:
        results.set("This site is a fake news site")
Example #5
def run(entry):
    Alchemy.AD_cls()
    brands = Alchemy.BL_sel()
    ad_data = Scrape.main(brands)
    msgbox(5, 0)
    #Alchemy.AD_ins(ad_data)
    return
Example #6
def PostComments():
    for post in AllPosts:
        if post.score < 40:
            keyphrase = key_phrase_analysis.getKeyPhrases(post.message)
            for key in keyphrase:
                print('')
            graph.put_comment(object_id=post.id,
                              message=Scrape.GetQuotes(post.category))
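PostComments relies on a graph object created elsewhere; with the facebook-sdk package that would typically be a GraphAPI instance. A sketch of that setup, with the access token as a placeholder:

import facebook

# hypothetical setup; the real token and API version are not shown in the example
graph = facebook.GraphAPI(access_token="YOUR_PAGE_ACCESS_TOKEN", version="3.1")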
Example #7
def checkLinks():
    # Check the domain of all the hyperlinks in the given URL
    fo = open('links.txt')
    for line in fo:
        # Call the getStrippedLink function in the Scrape script, to get only the domain of the hyperlinked URL 
        link = Scrape.getStrippedLink(line)
        if not Compare.check(link):
            return False
    return True
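Compare.check is assumed here to return True when the stripped domain is not a known fake-news domain. A minimal stand-in under that assumption, using a plain set instead of whatever data store the real Compare module queries:

# hypothetical stand-in for Compare.check
FAKE_DOMAINS = {"example-fake-news.com", "another-hoax-site.net"}

def check(link):
    # True means the domain is NOT in the blacklist
    return link.strip().lower() not in FAKE_DOMAINS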
Example #8
def launchScrapeHandler(timeout):

    scrapeHandler = Scrape.ScrapeHandler(timeout,
                                         DATABASE_PATH + DATABASE_NAME,
                                         DATABASE_TABLE_NAME, SCRAPE_PATH)

    # run the handler's fetch loop in a background thread
    scrapeThread = threading.Thread(target=scrapeHandler.fetch, args=())
    scrapeThread.start()

    return scrapeThread
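Note that launchScrapeHandler returns the started Thread, not the ScrapeHandler itself, so callers can wait for the fetch loop to finish. A short usage sketch (the timeout value is arbitrary):

worker = launchScrapeHandler(timeout=30)
# ... do other work while the fetch loop runs ...
worker.join()  # block until the handler's fetch() returns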
Example #9
def collect():
    print('\nCollecting @ ', datetime.now())  # current time
    start = t.time()

    articles_list = Scrape.get_articles()  # scraping data
    DBM.populate_db(articles_list)  # saving to database
    DBM.prune_db(days)  # pruning articles

    print(str(t.time() - start), 'seconds')  # timing the execution
    print('Next collection @ ~',
          datetime.now() + timedelta(minutes=min_interval),
          '\n')  # approx. next collection time
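collect() is apparently meant to run on a fixed interval; min_interval and days are referenced but defined elsewhere. A simple scheduling loop under those assumptions (both values are placeholders):

import time as t

min_interval = 30  # minutes between collections (assumed)
days = 7           # articles older than this are pruned (assumed)

while True:
    collect()
    t.sleep(min_interval * 60)  # wait until the next collection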
Example #10
def scrape():
    
    data = Scrape.scrapping()  # run the scraping routine exposed by the Scrape module

    return render_template("scrape.html", xyz=data)
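The function renders a template, so it is presumably registered as a Flask view somewhere in the application; the route below is a guess:

from flask import Flask

app = Flask(__name__)
# hypothetical registration; the original project may use @app.route instead
app.add_url_rule("/scrape", view_func=scrape)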
Example #11
# local data
# we will collect 100 articles from each site
local_urls = [
    'http://www.nbcwashington.com/news/local/?page=',
    'http://www.nbcnewyork.com/news/local/?page=',
    "http://www.nbcchicago.com/news/local/?page=",
    "http://www.nbcphiladelphia.com/news/local/?page="
]

# collect the first 100 article links from each of the URLs above
local_articles = []
i = 1
page = 1

scrapper = Scrape.NBCLocalScrape()

for url in local_urls:
    local_articles.extend(scrapper.get_article_urls(url, page))

    while len(local_articles) < i * 100:
        page += 1
        local_articles.extend(scrapper.get_article_urls(url, page))
    i += 1

# print(local_articles)
# output each article to a txt file under training, local-00i.txt
local_data = ""
for article in local_articles:
    # print(article)
    local_data += scrapper.get_article_text("http:" + str(article)) + " "
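The earlier comment mentions writing the articles to training/local-00i.txt, but that step is not shown; the loop above only concatenates everything into local_data. One way the missing step could look, writing one file per article (the naming scheme is inferred from the comment):

import os

os.makedirs("training", exist_ok=True)
for n, article in enumerate(local_articles, start=1):
    text = scrapper.get_article_text("http:" + str(article))
    with open(os.path.join("training", "local-{:03d}.txt".format(n)), "w") as f:
        f.write(text)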
Example #12
def addNews():
    Data = Scrape.scrape_gas()
    # collection.insert_one(Data)
    # Scrape_Data = Data.find_one()
    collection.replace_one({}, Data, upsert=True)
    return render_template('News.html', Gas_info=Data)
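collection is used but never created in this snippet; with pymongo the setup would typically look like the sketch below (the database and collection names are placeholders):

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["gas_db"]          # hypothetical database name
collection = db["gas_prices"]  # hypothetical collection name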
Example #13
import Scrape

cnn = Scrape.CNNScrape()

print(cnn.get_article_text("http://www.cnn.com/2017/08/14/politics/trump-condemns-charlottesville-attackers/index.html"))
Example #14
        pass
    


# ---------------------------------------------------- #
# Search the Jobs
Search(driver,search_bar,result_filter)

# If no matching jobs found
try:
    no_job_result = driver.find_element_by_xpath("//div[@class='jobs-search-no-results__image']")
    print('No Matching Jobs Found')
except Exception:
    pass

Instance = Scrape.Scraping(driver)
url_list = Instance.get_all_urls(driver)

# url_list must not be empty

if url_list: 
    for i in range(len(url_list)):
        if i >= 10:
            print('Only the first 10 pages of searched results are studied')
            url_list = url_list[0:10]
            break

final_dict = {}

for m in range(len(url_list)):
    original_job_links, shortened_job_links, job_ids = Instance.get_all_links(driver,url_list[m])
Example #15
urls = [
    "http://127.0.0.1:5000/static/test.html",
    "http://127.0.0.1:5000/static/test.html",
    "http://127.0.0.1:5000/static/test.html"
]
for url in urls:

    @Scrape.async_request_scrape(url, Sources.AioHttpSource,
                                 Save.TextDataBaseSQLalchemySaver)
    def saveDataBase(
        source: str,
        proxies=None
    ) -> Save.TextDataBaseSQLalchemySaver.TextDataBaseSQLalchemySaverData:
        meta = MetaData()
        engine = create_engine('sqlite:///school_exp.db')
        id_col = Column('id', Integer)
        name_col = Column('name', String)
        school_name_col = Column('school_name', String)
        major_col = Column('major', String)
        cols_ = [id_col, name_col, school_name_col, major_col]
        rows = [(1, 'steve', 'SJSU', 'chemistry'),
                (2, "Susu", "MIT", "biology"),
                (3, "Belle", "UCLA", "Simpology")]

        data = Save.TextDataBaseSQLalchemySaver.TextDataBaseSQLalchemySaverData(
            engine, meta, cols_, rows, 'test')
        return data


Scrape.run()
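If the saver works as its name suggests, the rows returned by the decorated function should end up in school_exp.db once Scrape.run() completes. A purely illustrative way to check with plain SQLAlchemy (the 'test' table name is taken from the last argument passed to TextDataBaseSQLalchemySaverData and is an assumption):

from sqlalchemy import create_engine, text

engine = create_engine('sqlite:///school_exp.db')
with engine.connect() as conn:
    for row in conn.execute(text("SELECT * FROM test")):
        print(row)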
Example #16
    tel = tel_input.get()
    web = web_input.get()

    if fb and nazwa and miasto and adres:
        place = Place(nazwa, opis, miasto, adres, fb, ig, email, tel, web)
        Ins.insert(place, inserter)
    else:
        fb_input.delete(0, END)
        fb_input.insert(INSERT, "Brak inforamcji.")


window = Tk()
window.title("Facebook Gastronomy Scraper")
window.resizable(0, 0)
window.protocol("WM_DELETE_WINDOW", on_closing)
scraper = Scr.open_driver()
inserter = Ins.login()

top = Frame(window)
top.grid(row=0, column=0)

Label(top, text="Facebook: ").grid(row=0, column=0, padx=5, pady=15)

fb_input = Entry(top, width=40)
fb_input.grid(row=0, column=1, padx=5, pady=15)

Button(top, text="Pobierz", width=7, command=scrape).grid(
    row=0, column=2, padx=5, pady=15)

images = Frame(window)
images.grid(row=1, column=1)
Example #17
import Scrape, Parse, UpdWordAt
import os

target_url = 'http://news.yahoo.co.jp'
target_id = 'editorsPick'

# scraping results
title_scraped_list = []
# MeCab analysis results (one list entry per sentence)
title_parsed_list = []
# MeCab word, part of speech, and kana reading
word_attribute_list = [[], [], []]

# fetch the list of article titles
# e.g. ["Ruling and opposition parties spar over no-confidence motion", "Hitomi satellite doomed by a series of human errors", ...]
title_scraped_list = Scrape.scrape(target_url, target_id)


for elem in title_scraped_list:
    # break the sentence into parts of speech
    # output as a newline-separated list
    title_parsed_list.append(Parse.execute(elem))


# break each title into lists of words, parts of speech, and kana readings
for title_parsed_sentence in title_parsed_list:

    # build a list with one element per word
    title_line = title_parsed_sentence.split(os.linesep)

    for title_sep_words in title_line: