def ranking_domain(self, keyword, lang):
    # Return a dataframe of domains whose pages contain the keyword.
    read_data = ManageFile("WebCrawler", keyword + "_cut" + lang, [], "r")
    df = pandas.DataFrame(read_data.managefile_main())
    csv_ = ""
    array = []
    temp = df[3].str.lower()
    if (lang == "en"):
        csv_ = df[temp.str.contains("|".join(self.DOMAIN_en))]
        array = self.DOMAIN_en
    elif (lang == "th"):
        csv_ = df[temp.str.contains("|".join(self.DOMAIN_th))]
        array = self.DOMAIN_th
    elif (lang == "all"):
        csv_ = df[temp.str.contains("|".join(self.DOMAIN))]
        array = self.DOMAIN
    write = ManageFile("Hit_Trends", "top_domain", ["keyword", "number"], "w")
    # Sort the domains by the number of links that contain the keyword.
    sorted_dict = {}
    for i in array:
        sorted_dict[i] = temp.str.contains(i).sum()
    a = {}
    for i in sorted(sorted_dict, key=sorted_dict.get, reverse=True):
        write.managefile_main([i, sorted_dict[i]])
        a[i] = sorted_dict[i]
    df_out = pandas.DataFrame({"keyword": a.keys(), "number": a.values()})
    return df_out
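# A minimal, self-contained sketch of the domain-ranking idea above, assuming a
# plain pandas Series of lowercased link URLs (the real code reads them through
# ManageFile); the URLs and domain list here are illustrative only.
import pandas

links = pandas.Series([
    "https://example.com/news/a",
    "https://example.com/news/b",
    "https://another.org/post/1",
]).str.lower()

domains = ["example.com", "another.org"]

# Count how many links belong to each domain, then sort descending,
# mirroring the sorted_dict loop in ranking_domain.
counts = {d: links.str.contains(d, regex=False).sum() for d in domains}
ranking = pandas.DataFrame(
    sorted(counts.items(), key=lambda kv: kv[1], reverse=True),
    columns=["keyword", "number"],
)
print(ranking)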
def check_keyword(self, keyword):
    file_name = 'list_keywords.csv'
    file_name_Ncsv = 'list_keywords'
    try:
        save = open(file_name, "r")
        df = pandas.read_csv(save)
        # Check whether this keyword has already been searched.
        condition = (df["keywords"] == keyword)
        num = len(df[condition])
        # num > 0 means the keyword has already been searched.
        twitter = ManageFile("Twitter", keyword + "_Ncut" + self.lang,
                             ["time", "content", "places"], "r")
        crawler = ManageFile("WebCrawler", keyword + "_Ncut" + self.lang,
                             ["time", "header", "content", "link"], "r")
        if (num > 0 and (not twitter.do_it or not crawler.do_it)):
            return True
        twitter.close()
        crawler.close()
        save.close()
        save = ManageFile("", file_name_Ncsv, ["keywords"], "a")
        save.managefile_main([keyword])
    except FileNotFoundError:
        # First run: the keyword list does not exist yet.
        temp = open(file_name, "w", newline='')
        temp.write("keywords\n")
        temp.write(f"{keyword}\n")
        temp.close()
    return False
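# A minimal sketch of the "has this keyword been searched before?" check,
# assuming a plain CSV file with a single `keywords` column; the file path and
# helper name are illustrative, not part of the project API.
import pandas

def seen_before(keyword, path="list_keywords.csv"):
    try:
        df = pandas.read_csv(path)
    except FileNotFoundError:
        # First run: create the file with just a header and this keyword.
        pandas.DataFrame({"keywords": [keyword]}).to_csv(path, index=False)
        return False
    if (df["keywords"] == keyword).any():
        return True
    # Append the new keyword so the next run remembers it.
    df = pandas.concat([df, pandas.DataFrame({"keywords": [keyword]})])
    df.to_csv(path, index=False)
    return False

print(seen_before("example"))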
def test_managefile_main(self):
    row_len_old = -1
    read = open("Test_write_file/" + "test" + ".csv", "r")
    reader = csv.reader((line.replace('\0', '') for line in read), delimiter=",")
    for i in reader:
        row_len_old += 1
    writefile = ManageFile("Test_write_file", "test", ["a", "b", "c"], "w")
    for i in range(random.randint(0, 10), random.randint(11, 100)):
        writefile.managefile_main([i * 0, i * 1, i * 2])
    writefile.close()
    row_len_new = -1
    read = open("Test_write_file/" + "test" + ".csv", "r")
    reader = csv.reader((line.replace('\0', '') for line in read), delimiter=",")
    for i in reader:
        row_len_new += 1
    self.assertGreater(row_len_new, row_len_old)
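# A short sketch of the row-counting idea the test relies on, assuming a CSV
# file with a header row; the path and helper name are illustrative.
import csv

def data_row_count(path):
    with open(path, "r", newline="") as fh:
        # Strip stray NUL bytes the same way the test does, then count rows,
        # subtracting one for the header line.
        reader = csv.reader(line.replace("\0", "") for line in fh)
        return sum(1 for _ in reader) - 1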
class Twitter_API:
    def __init__(self, nlp):
        # Key and token
        CONSUMER_KEY = "ku1u0AkXp7DiD8UuDFBD5ejc7"  # aka API key
        CONSUMER_SECRET = "3OifKHMc5Ik7VMUhjoGUu4BZBDLRDLUTeM6Qo2M70OYKqHgpGP"  # aka API key secret
        ACCESS_TOKEN = "1348183052179001347-Sy8D0nHWqhVjKYiQ2cVTNgkv6m1HYW"
        ACCESS_TOKEN_SECRET = "Tars6ymAzSCwLTTxGfeqR78cJTAhm7c7mfen5UAXKa1WQ"

        # Authenticate to Twitter
        self.auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        self.auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

        # Create API object
        self.api = tweepy.API(self.auth,
                              wait_on_rate_limit=True,
                              wait_on_rate_limit_notify=True)
        self.nlp2 = nlp

    def main_twitter(self, lang, count, keyword, since, until, update=False):
        # Constant offset used to step backwards through tweet ids.
        OFFSET = 38555555555555
        # Open the output file in append mode.
        column = ['time', 'content', 'places']
        self.writer2 = ManageFile("Twitter", keyword + "_Ncut" + lang, column, "a")
        self.date_since = datetime.strptime(since + " 00:00:00", "%Y-%m-%d %H:%M:%S")
        until = datetime.strptime(until, "%Y-%m-%d") + timedelta(days=1)
        until = str(until).split(" ")[0]
        count_ = 0   # counts how many search rounds have run
        maxId = -1   # starter id
        moreOFFSET = 0
        tricker = True
        query = keyword  # the word to search for
        count = count    # the number of results to try and retrieve per page
        tweet_mode = "extended"
        result_type = "current"  # used when update mode is active
        while (tricker):
            print(count_, "round Twitter")
            print(count_)
            try:
                if (maxId <= 0 and moreOFFSET < 1):
                    # First round.
                    data = self.api.search(q=query,
                                           lang=lang,
                                           count=count,
                                           tweet_mode=tweet_mode,
                                           result_type=result_type,
                                           until=until)
                else:
                    # Subsequent rounds.
                    if (moreOFFSET >= 1):
                        # The offset keeps growing each round.
                        data = self.api.search(
                            q=query,
                            lang=lang,
                            count=count,
                            tweet_mode=tweet_mode,
                            result_type=result_type,
                            max_id=str(maxId - OFFSET - 555555555 -
                                       (100000000 * moreOFFSET)),  # a x 10^13
                            until=until)
                        count_ += 1
                        moreOFFSET += 1
                    else:
                        # Constant offset.
                        data = self.api.search(
                            q=query,
                            lang=lang,
                            count=count,
                            tweet_mode=tweet_mode,
                            result_type=result_type,
                            max_id=str(maxId - OFFSET),  # a x 10^13
                            until=until)
                maxId = data[-1].id
                count_ += 1  # counter
                # Write the tweets to the .csv file.
                tricker = self.write_csv(data, keyword, lang, since, until)
            except IndexError:
                # data[-1].id raised IndexError, i.e. the search returned no tweets at all.
                # print("no data")
                moreOFFSET += 1
                count_ += 1
                if (count_ >= 10):
                    tricker = False
            except tweepy.error.TweepError:
                pass
        self.writer2.close()
        print("Twitter Done", count_)

    def update_mode(self, query, lang, count, tweet_mode, result_type, since, until):
        data = tweepy.Cursor(self.api.search,
                             q=query,
                             lang=lang,
                             count=count,
                             tweet_mode=tweet_mode,
                             result_type=result_type,
                             until=until).items()
        # tricker = self.write_csv(data, query, lang, since, until)
        return data

    def write_csv(self, data, keyword, lang, since, until):
        # Write a .csv file for checking and recording the information.
        """column = ['time', 'content', 'places']
        self.writer2 = ManageFile("Twitter", keyword+"_Ncut"+lang, column, "a")
        self.date_since = datetime.strptime(since+" 00:00:00", "%Y-%m-%d %H:%M:%S")"""
        for infor in data:
            date_created_at = datetime.strptime(str(infor.created_at),
                                                "%Y-%m-%d %H:%M:%S")
            # print(date_created_at, self.date_since)
            # When update mode is active the time check is ignored.
            if (date_created_at < self.date_since):
                # print(date_created_at, self.date_since, "<")
                return False
            all_lang = self.nlp2.detection_lang(infor.full_text)
            check_lang = lang == all_lang
            if (lang == "all"):
                check_lang = ("en" == all_lang) or ("th" == all_lang)
            if (("RT @" not in infor.full_text) and check_lang):
                # print([str(infor.created_at), infor.full_text, infor.user.location])
                self.writer2.managefile_main([
                    str(infor.created_at), infor.full_text, infor.user.location
                ])
                # writerow({'places': infor.user.location, 'time': str(infor.created_at),
                #           'message': infor.full_text, 'link': "-"})
        return True

    def hit_trends(self):
        start = time.time()
        column = ["keyword", "tweet"]
        writer = ManageFile("Hit_Trends", "Hit_Trends", column, "w")
        # WOEID of Bangkok
        woeid = 1225448
        # Fetch the trends.
        trends = self.api.trends_place(id=woeid)
        # Print the information.
        print("The top trends for the location are :")
        for value in trends:
            for trend in value['trends']:
                writer.managefile_main([trend["name"], trend["tweet_volume"]])
                # print(trend["name"], trend["tweet_volume"])
        print(time.time() - start, "hittwitter")
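# A minimal sketch of the usual max_id pagination pattern that main_twitter's
# manual OFFSET arithmetic approximates, assuming tweepy 3.x (api.search) and an
# already-authenticated `api` object; the function name and parameters here are
# illustrative, not part of the project.
def fetch_pages(api, query, lang="en", pages=5, per_page=100):
    tweets, max_id = [], None
    for _ in range(pages):
        kwargs = dict(q=query, lang=lang, count=per_page, tweet_mode="extended")
        if max_id is not None:
            kwargs["max_id"] = max_id
        page = api.search(**kwargs)
        if not page:
            break  # no older tweets available
        tweets.extend(page)
        # Ask the next call for tweets strictly older than the oldest one seen.
        max_id = page[-1].id - 1
    return tweets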
class websites_crawler:
    def __init__(self, nlp):
        self.URL_en = []
        self.URL_th = []
        read_url_en = open("website_crawler_en.txt", "r")
        read_url_th = open("website_crawler_th.txt", "r")
        for lib in read_url_en:
            self.URL_en.append(lib.split("\n")[0])
        for lib in read_url_th:
            self.URL_th.append(lib.split("\n")[0])
        self.DOMAIN_en = []
        self.DOMAIN_th = []
        for ie in self.URL_en:
            self.DOMAIN_en.append(ie.split("/")[2])
        for it in self.URL_th:
            self.DOMAIN_th.append(it.split("/")[2])
        # The value in set_pattern is the priority level: the higher the value,
        # the more important the pattern, so the patterns are ordered by it.
        set_pattern = {
            ("meta", "property", "og:type"): 3,
            ("meta", "name", "og:type"): 2,
            ("meta", "property", "og:url"): 1,
            ("meta", "name", "og:url"): 0
        }
        self.pattern_list = sorted(set_pattern, key=set_pattern.get, reverse=True)
        self.DOMAIN = self.DOMAIN_en + self.DOMAIN_th
        self.nlp_web = nlp
        self.check_bug = open("check_bugA.txt", "w", newline="\n")
        self.check_thread = open("check_threadA.txt", "w", newline="\n")
        self.check_data = open("check_dataA.txt", "w", newline="\n")
        self.check_data_a = open("check_data_aA.txt", "w", newline="\n")
        self.check_done = open("check_doneA.txt", "w", newline="\n")
        self.MAX_THREADS = len(self.DOMAIN)  # threads run one layer at a time
        self.thread_local = threading.local()
        self.output = []
        self.output2 = []
        self.output3 = []
        self.output4 = []
        self.output_write = [[], []]

    def download_url(self, url, domain, count):
        links = Counter()
        try:
            # Log which thread is running.
            print(str(self.DOMAIN.index(domain)) + "_A" + domain + "\n")
            self.check_thread.write(
                str(self.DOMAIN.index(domain)) + "_A" + domain + "\n")
            self.check_thread.flush()
            tage_a_all = []
            html_code = []
            # Request the HTML code from the server.
            if (type(url) == type(str())):
                session = self.get_session()
                resp = session.get(url)
                html_code = resp.content
                html_page = BeautifulSoup(html_code, "html.parser")
                tage_a_all = html_page.find_all("a")
            elif (type(url) == type(bytes())):
                html_code = url
                html_page = BeautifulSoup(html_code, "html.parser")
                tage_a_all = html_page.find_all("a")
            # Find the topic links, so links that also appear in the topic can be rejected.
            topic = self.find_topic(html_code, domain)
            # Collect the links from every "a" tag on the page.
            for x in tage_a_all:
                try:
                    if (x["href"]):
                        temp = self.link_format(x["href"], domain)
                        # Only links on the same domain are accepted.
                        same = temp.split("/")[2]
                        if (domain == same):
                            if (count == self.count):
                                # The first round collects every link found.
                                links += Counter([temp])
                            else:
                                # From the second round on, reject links that are
                                # already in the output or in the topic list.
                                if (temp not in topic and temp not in self.output):
                                    links += Counter([temp])
                except IndexError:
                    pass
                except KeyError:
                    pass
            perfect = []
            for i in links.keys():
                perfect.append(i)
                self.check_data_a.write(str(i) + "\n")
                self.check_data_a.flush()
            self.output += perfect
            self.output4 += perfect
            print("length:", len(perfect), " output:", len(self.output),
                  " topic:", len(topic), " round:", str(count) + "\n")
        except:
            error = traceback.format_exc()
            print(error)
            self.check_bug.write(str(error) + "\n" + " " + str(url))
            self.check_bug.flush()
            pass
        return self.output4

    def analytics_url(self, link, topic, domain):
        t__0 = time.time()
        try:
            # Log which thread is running.
            print(str(self.DOMAIN.index(domain)) + "_B")
            self.check_thread.write(
                str(self.DOMAIN.index(domain)) + "_B" + domain + "\n")
            self.check_thread.flush()
            # Request the HTML code from the server.
            session = self.get_session()
            res = session.get(link, timeout=20)
            html_code = res.content
            soup = BeautifulSoup(html_code, "html.parser")
            topic = self.find_topic(html_code, domain)
            # If the link is one of the topic links, it has to be rejected.
            if (link in topic):
                tage_a_all = soup.find_all("a")
                self.output2.append(link)
                self.output3.append(html_code)
                print(time.time() - t__0)
                self.check_done.write(str(link) + "\n")
                self.check_done.flush()
                return "No"
            # Meta tag patterns.
            type_ = None
            for pattern in self.pattern_list:
                type_ = soup.find(pattern[0], {pattern[1]: pattern[2]})
                if (type_ != None):
                    break
            try:
                # Rare meta tag cases.
                if (type_["content"] == ""):
                    # og:type is an empty string.
                    print(time.time() - t__0)
                    self.check_done.write(str(link) + "\n")
                    self.check_done.flush()
                    return "website"
                elif (type_["content"] == link):
                    # The og:url meta tag holds the same link.
                    # self.find_message(html_code, link)
                    # self.output_write[0].append(link)
                    # self.output_write[1].append(html_code)
                    self.output2.append(link)
                    self.output3.append(html_code)
                    print(time.time() - t__0)
                    self.check_done.write(str(link) + "\n")
                    self.check_done.flush()
                    return "article"
            except TypeError:
                pass
            except:
                print("UnknownError", link)
                self.check_bug.write(
                    str(traceback.format_exc()) + "\n" + " " + str(link))
                self.check_bug.flush()
            # If the meta tag says "article", the page can be written to file.
            if (type_):
                x = type_["content"]
                if (x == "article"):
                    # self.find_message(html_code, link)
                    # self.output_write[0].append(link)
                    # self.output_write[1].append(html_code)
                    self.output2.append(link)
                    self.output3.append(html_code)
                    print(time.time() - t__0)
                    self.check_done.write(str(link) + "\n")
                    self.check_done.flush()
                    return x
            else:
                print(time.time() - t__0)
                self.check_done.write(str(link) + "\n")
                self.check_done.flush()
                return "No meta type"
        # --------------------- it's not a real link ---------------------
        except requests.exceptions.MissingSchema:
            print("MissingSchema", link)
            return "No"
        except requests.exceptions.InvalidSchema:
            print("InvalidSchema", link)
            return "No"
        except requests.exceptions.SSLError:
            print("SSLError", link)
            return "No"
        except requests.exceptions.ConnectionError:
            print("ConnectionError", link)
            return "No"
        except requests.exceptions.ReadTimeout:
            print("ReadTimeout", link)
            return "No"
        except requests.exceptions.TooManyRedirects:
            print("TooManyRedirects", link)
            return "No"
        except requests.exceptions.ChunkedEncodingError:
            print("ChunkedEncodingError", link)
            return "No"
        except:
            print("UnknownError", link)
            self.check_bug.write(
                str(traceback.format_exc()) + "\n" + " " + str(link))
            self.check_bug.flush()
            return "No"
        # ----------------------------------------------------------------

    def find_message(self, html_code, url):
        t_0 = time.time()
        soup = BeautifulSoup(html_code, 'html.parser', parse_only=SoupStrainer("div"))
        tit = BeautifulSoup(html_code, 'html.parser')
        title = tit.find("meta", property="og:title")  # find the title
        title = title["content"] if title else ""
        message = soup.find_all(name="p")  # find the message body
        temp_message = ""
        output = []
        for i in message:
            temp_message += i.text + "\n"
        time_ = self.find_time(html_code)
        try:
            data = [
                time_[0] + " " + time_[1],
                str(title),
                str(temp_message),
                str(url)
            ]
            # =========================== This point is writing ===========================
            column = ['time', 'header', 'content', 'link']
            self.write = ManageFile("WebCrawler/Database", time_[0], column, "a", ["link"])
            # =============================================================================
            self.write.managefile_main(data)  # write the file
            # print("Write file")
            self.check_data.write(str(url) + " " + str(time.time() - t_0) + "\n")
            self.check_data.flush()
        except TypeError:
            str(datetime.now()).split(" ")[0]

    def find_time(self, html_code):
        # Find the publication time inside the page.
        try:
            soup = BeautifulSoup(html_code, 'html.parser',
                                 parse_only=SoupStrainer("script"))
            date = soup.find_all(name="script")
            reg = re.compile(
                r'(?P<date>202\d-\d\d-\d\d)(?P<time>T\d\d:\d\d:\d\d| \d\d:\d\d)')
            ou = reg.search(str(date))
            date_output = ou.group("date")
            time_output = ou.group("time")[1:]
            return [str(date_output), str(time_output)]
        except AttributeError:
            # Ex: Jan 27 2021 06:31:00:000PM+07:00 ==> 2021-01-27 18:31:00
            try:
                reg = re.compile(
                    r'(?P<date>\w\w\w \d\d \d\d\d\d)(?P<time> \d\d:\d\d:\d\d:000AM| \d\d:\d\d:\d\d:000PM)')
                ou = reg.search(str(date))
                date_output = ou.group("date")
                time_output = ou.group("time")[1:]
                temp1 = datetime.strptime(date_output, "%b %d %Y")
                temp2 = datetime.strptime(time_output, "%I:%M:%S:000%p")
                return [str(temp1).split(" ")[0], str(temp2).split(" ")[1]]
            except AttributeError:
                # The page does not use the Jan 27 2021 06:31:00:000PM+07:00 format,
                # so fall back to the current time.
                date_now = str(datetime.now()).split(" ")
                reg = re.compile(
                    r'(?P<date>202\d-\d\d-\d\d)(?P<time> \d\d:\d\d:\d\d)')
                ou = reg.search(str(datetime.now()))
                date_output = ou.group("date")
                time_output = ou.group("time")[1:]
                return [str(date_output), str(time_output)]

    def find_topic(self, html_code, domain):
        try:
            # Find the links inside the nav, header, or div tags.
            # res = requests.get(url, timeout=20)
            # html_page = res.content
            set_html_tag = ["nav", "header", "div"]  # text is filled in later
            data = []
            count = 0
            # -------------------------------------- header --------------------------------------
            while (data == [] and (count != len(set_html_tag))):
                # While data is an empty list, keep trying the next HTML tag.
                soup = BeautifulSoup(html_code, 'html.parser',
                                     parse_only=SoupStrainer(set_html_tag[count]))
                data = soup.find_all(name="ul")
                count += 1
            storage = []
            for i in data:
                temp = i.find_all("li")
                for j in temp:
                    try:
                        g = j.find("a")["href"]
                        g = self.link_format(g, domain)
                        if (g == ""):
                            continue
                        storage.append(g)
                    except TypeError:
                        # print(g)
                        pass
                    except KeyError:
                        # print(g)
                        pass
            # -------------------------------------------------------------------------------------
            # -------------------------------------- tail --------------------------------------
            soup1 = BeautifulSoup(html_code, 'html.parser',
                                  parse_only=SoupStrainer("footer"))
            sub_footer = []
            for i in soup1.find_all("a"):
                if (i.get("href") == None):
                    continue
                footer = self.link_format(i.get("href"), domain)
                if (footer == ""):
                    continue
                sub_footer.append(footer)
            return storage + ["*" * 10] + sub_footer
        except requests.exceptions.ReadTimeout:
            return []
        except requests.exceptions.TooManyRedirects:
            return []
        # ---------------------------------------------------------------------------------

    def find_domain(self, url):
        temp = []
        for i in url:
            temp.append(i.split("/")[2])
        return temp

    def link_format(self, str_input, domain):
        # An empty input string is not a link, so str_out stays empty.
        if (str_input == ""):
            str_out = ""
        else:
            str_out = re.search(
                r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
                str_input)
            # If str_out is None, build the full link from the relative one.
            if (str_out == None):
                if (str_input[0:2] == "//" and len(str_input) > 3):
                    str_out = "https:" + str_input
                elif (str_input[0] == "/" and len(str_input) > 3):
                    str_out = "https://" + domain + str_input
                elif (str_input[0:2] == "./" and len(str_input) > 3):
                    str_out = "https://" + domain + "/" + str_input[2:]
                    # print(str_out)
                else:
                    str_out = ""
            else:
                # str_out matched, so it is already a searchable link.
                str_out = str_out.group()
                # Some matches are missing https:// or http://.
                if ("https://" in str_out or "http://" in str_out):
                    pass
                else:
                    str_out = "https://" + str_out
        return str(str_out)

    def concurrent_futures(self, func, arg1, arg2, arg3):
        threads = len(arg1) + 1
        r = None
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            executor.map(func, arg1, arg2, arg3)

    def get_session(self):
        if not hasattr(self.thread_local, "session"):
            self.thread_local.session = requests.Session()
        return self.thread_local.session

    def searching(self, keyword, lang, since, until):
        print("Start Crawler")
        column = ['time', 'header', 'content', 'link']
        # file_name="" means no file is created beforehand.
        check = ManageFile(fold_name="WebCrawler",
                           file_name="",
                           column_data=column,
                           mode="a")
        temp_until = datetime.strptime(until, "%Y-%m-%d")
        temp_since = datetime.strptime(since, "%Y-%m-%d")
        dif = temp_until - temp_since
        if (dif == timedelta(days=0)):
            dif = "0 day"
        print(dif)
        day = int(str(dif).split(" ")[0]) + 1
        array = []
        for i in range(day):
            date = str(temp_since + timedelta(days=i)).split(" ")[0]
            print(date)
            df = None
            if (lang == "en"):
                df = check.find_copy_to(keyword=keyword,
                                        reader="Database\\" + date,
                                        column=["link", "header"],
                                        condition=[self.DOMAIN_en, keyword],
                                        nlp=self.nlp_web)
            elif (lang == "th"):
                df = check.find_copy_to(keyword=keyword,
                                        reader="Database\\" + date,
                                        column=["link", "header"],
                                        condition=[self.DOMAIN_th, keyword],
                                        nlp=self.nlp_web)
            elif (lang == "all"):
                df = check.find_copy_to(keyword=keyword,
                                        reader="Database\\" + date,
                                        column=["link", "header"],
                                        condition=[self.DOMAIN, keyword],
                                        nlp=self.nlp_web)
            array.append(df)
        if (dif == "0 day"):
            array.append(pandas.DataFrame(columns=column))
        result = pandas.concat(array)
        target_file = open(check.path + "\\" + keyword + "_cut" + lang + ".csv",
                           "w", newline="")
        target_file.write(result.to_csv(index=False))

    def main_crawler(self, keyword, lang, since, until, update=False):
        if (update):
            self.check_data.write("Start: " + str(datetime.now()) + "\n")
            self.check_data.flush()
            self.URL = self.URL_en + self.URL_th
            self.count = 2
            url = self.URL.copy()
            domain = self.DOMAIN.copy()
            all_time0 = time.time()
            for count in range(self.count, 0, -1):
                all_t0 = time.time()
                # Run download_url over every page.
                t0 = time.time()
                self.concurrent_futures(self.download_url, url, domain,
                                        [count] * len(url))
                t1 = time.time()
                print(f"time: {t1-t0} seconds, length: {len(self.output4)} links. 1")
                # Hand the collected links to the analytics_url threads.
                t0 = time.time()
                domain = self.find_domain(self.output4)
                self.concurrent_futures(self.analytics_url, self.output4,
                                        ["s"] * len(self.output4), domain)
                t1 = time.time()
                print(f"time: {t1-t0} seconds, length: {len(self.output4)} links. 3")
                # Treat identical links as one link.
                url = self.output3.copy()
                domain = self.find_domain(self.output2)
                self.output2 = []
                self.output3 = []
                self.output4 = []
                # self.output_write = [[], []]
                all_t1 = time.time()
                cou = len(open("check_data.txt").readlines())
                print(f"time: {all_t1-all_t0} seconds, length: {cou-1} links. 4")
            all_time1 = time.time()
            cou = len(open("check_data.txt").readlines())
            self.check_data.write("time: " + str(all_time1 - all_time0) +
                                  " seconds, length: " + str(cou - 1) + " link")
            self.check_data.flush()
            print(f"time: {all_time1-all_time0} seconds, length: {cou-1} links.")
        else:
            self.searching(keyword, lang, since, until)
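# A minimal, standalone sketch of the threading pattern used above (thread-local
# requests.Session objects driven by a ThreadPoolExecutor), assuming a small list
# of example URLs; the names and URLs here are illustrative only.
import concurrent.futures
import threading
import requests

thread_local = threading.local()

def get_session():
    # Each worker thread keeps its own Session so connections can be reused
    # without sharing one Session object across threads.
    if not hasattr(thread_local, "session"):
        thread_local.session = requests.Session()
    return thread_local.session

def fetch(url):
    resp = get_session().get(url, timeout=20)
    return url, resp.status_code, len(resp.content)

urls = ["https://example.com", "https://example.org"]
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    for url, status, size in executor.map(fetch, urls):
        print(url, status, size)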
def geometry_map(self, keyword, lang):
    # ------------------------------------ Read files ------------------------------------
    read = ManageFile("Twitter", keyword + "_cut" + lang,
                      ["time", "content", "places"], "r")
    read_2 = ManageFile("GUI_show", "location_lati_long",
                        ["places", "lati", "long"], "r")
    read_3 = ManageFile("GUI_show", "non_exist_lati_long",
                        ["places", "lati", "long"], "r")
    # -------------------------------------------------------------------------------------
    # ------------------------------------ Write files ------------------------------------
    write_exist = ManageFile("GUI_show", "location_lati_long",
                             ["places", "lati", "long"], "a")
    write_non_exist = ManageFile("GUI_show", "non_exist_lati_long",
                                 ["places", "lati", "long"], "a")
    write_data_show = ManageFile("GUI_show", "map_lati_long",
                                 ["places", "lati", "long"], "w")
    # -------------------------------------------------------------------------------------
    location = ""  # used to check whether the location already exists in the location database
    lati = ""
    longi = ""
    # The temp arrays are checked as well, because the file is sometimes not fully written yet.
    # data holds location names that exist; non_exist holds location names that do not.
    non_exist = []
    data = []
    # Data from the database files.
    data_exist = []
    data_non_exist = []
    count = 0  # counts how many addresses are new versus old

    # data_exist holds location names that geopy can resolve.
    first = 0
    for i in read_2.managefile_main():
        i[0] = self.nlp_main.clear_name_places(i[0]).lower()
        if (first > 0):
            data_exist.append(i)
        first += 1

    # data_non_exist holds location names that geopy cannot resolve.
    first = 0
    for i in read_3.managefile_main():
        i[0] = self.nlp_main.clear_name_places(i[0]).lower()
        if (first > 0):
            data_non_exist.append(i)
        first += 1

    # Read the Twitter data and look up latitude & longitude with geopy.
    first = 0
    for i in read.managefile_main():
        i[2] = self.nlp_main.clear_name_places(i[2]).lower()
        if (first > 0):
            if (i[2] == ""):
                first += 1
                continue
            try:
                # Does it already exist in the database file?
                for j in data_exist:
                    if (i[2] == j[0]):
                        location = "exist1"
                        lati = j[1]
                        longi = j[2]
                        write_data_show.managefile_main([i[2], str(lati), str(longi)])
                        data.append([i[2], str(lati), str(longi)])
                        print("exist1")
                if (location != "exist1"):
                    # Does it exist in the temp array?
                    for k in data:
                        if (i[2] == k[0]):
                            location = "exist2"
                            lati = k[1]
                            longi = k[2]
                            print("exist2")
                    if (location == "exist2"):
                        write_data_show.managefile_main([i[2], str(lati), str(longi)])
                        data.append([i[2], str(lati), str(longi)])
                if (location != "exist1" and location != "exist2"):
                    # It does not exist in the database.
                    for p in data_non_exist:
                        # Is it already known (from the file) that geopy cannot resolve it?
                        if (i[2] == p[0]):
                            location = "non_exist"
                            print("non_exist")
                    if (i[2] in non_exist):
                        first += 1
                        continue
                    if (location != "non_exist"):
                        # It is a new address, so ask geopy.
                        location2 = self.geolocator.geocode(i[2])
                        lati = location2.latitude
                        longi = location2.longitude
                        print("Geopy")
                        write_data_show.managefile_main([i[2], str(lati), str(longi)])
                        write_exist.managefile_main([i[2], str(lati), str(longi)])
                        data.append([i[2], str(lati), str(longi)])
                        count += 1
                location = ""
            except AttributeError:
                write_non_exist.managefile_main([i[2], str(lati), str(longi)])
                non_exist.append(i[2])
            except geopy.exc.GeocoderUnavailable:
                write_non_exist.managefile_main([i[2], str(lati), str(longi)])
                non_exist.append(i[2])
            except geopy.exc.GeocoderServiceError:
                pass
        first += 1
    print(len(data), count)  # number of locations & number of geopy lookups

    # ---------- Return a dataframe of locations with place name, latitude, and longitude ----------
    dict_ = {}
    places_array = []
    lati_array = []
    long_array = []
    for i in data:
        places_array.append(i[0])
        lati_array.append(i[1])
        long_array.append(i[2])
    dict_["places"] = places_array
    dict_["lati"] = lati_array
    dict_["long"] = long_array
    if (data == []):
        dict_ = {"places": [], "lati": [], "long": []}
    df = pandas.DataFrame(dict_)
    # ------------------------------------------------------------------------------------------------
    return df
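# A minimal sketch of the geocode-with-cache idea that geometry_map implements,
# assuming geopy's Nominatim geocoder; the user_agent string, helper name, and
# place name are illustrative.
from geopy.geocoders import Nominatim
import geopy.exc

geolocator = Nominatim(user_agent="hit-trends-demo")
cache = {}             # place name -> (lat, long) already resolved
unresolvable = set()   # place names geopy could not resolve

def lookup(place):
    place = place.strip().lower()
    if not place or place in unresolvable:
        return None
    if place in cache:
        return cache[place]
    try:
        loc = geolocator.geocode(place)
        if loc is None:
            unresolvable.add(place)
            return None
        cache[place] = (loc.latitude, loc.longitude)
        return cache[place]
    except (geopy.exc.GeocoderUnavailable, geopy.exc.GeocoderServiceError):
        return None

print(lookup("Bangkok"))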
def cut_text(self, folder, keyword, column, lang, since, until):
    # ----------------------- Read the content file -----------------------
    # Take the file that has already been filtered by time and cut the words.
    read = None
    if (folder == "WebCrawler"):
        read = ManageFile(folder, keyword + "_cut" + lang, column, "r")
    elif (folder == "Twitter"):
        read_data = ManageFile(folder, keyword + "_Ncut" + lang, column, "r")
        # ----------------------- Read the file into pandas -----------------------
        csv_data = read_data.managefile_main()
        pd_data = pandas.DataFrame(csv_data)
        # --------------------------------------------------------------------------
        # ----------------------- Select the time period -----------------------
        data_ = self.read_time(folder, pd_data, since, until)
        # -----------------------------------------------------------------------
        # ----------------------- Write a temporary file -----------------------
        data_str = data_.to_csv(index=False)
        # print(data_str)
        write_file = open(read_data.path + "\\" + keyword + "_cut" + lang + ".csv",
                          "w", newline="")
        write_file.write(data_str)
        write_file.close()
        # -----------------------------------------------------------------------
        read = ManageFile(folder, keyword + "_cut" + lang, column, "r")
    else:
        read = ManageFile(folder, keyword + "_cut" + lang, column, "r")
    data = read.managefile_main()
    write_sort_text = ManageFile(
        "GUI_show", keyword + "_ranking_" + str(folder).lower() + lang,
        ["keyword", "number"], "w")
    write_sort_text_all = ManageFile("GUI_show", keyword + "_ranking_all" + lang,
                                     ["keyword", "number"], "w")
    # ----------------------------------------------------------------------
    # ------------------------------ Column ------------------------------
    column_section = 0
    if (folder == "WebCrawler"):
        column_section = 2
    elif (folder == "Twitter"):
        column_section = 1
    # ---------------------------------------------------------------------
    print("*****************************************" + folder +
          " Start SENTIMENT & NLP*****************************************")
    sort_dict = Counter()
    first = 0
    start = time.time()
    for i in data:
        # (1) Cut the text with NLP and run sentiment analysis at the same time.
        if (first > 0):
            cut1 = self.nlp_main.main_nlp(i[column_section])
            if (folder == "WebCrawler"):
                self.array_sentiment_web.append(
                    self.sentiment_text(cut1, i[column_section], lang))
            elif (folder == "Twitter"):
                self.array_sentiment_twi.append(
                    self.sentiment_text(cut1, i[column_section], lang))
            self.array_sentiment.append(
                self.sentiment_text(cut1, i[column_section], lang))
            print(len(self.array_sentiment))
            sort_dict += Counter(cut1)
        first += 1
    print(first, time.time() - start,
          "*****************************************" + folder +
          " END SENTIMENT & NLP*****************************************")
    print("ALL: " + str(len(self.array_sentiment)) + ", Twitter:" +
          str(len(self.array_sentiment_twi)) + ", WebCrawler:" +
          str(len(self.array_sentiment_web)))
    # (2) Sort the words and write the file used by the GUI.
    for w in sorted(sort_dict, key=sort_dict.get, reverse=True)[:11]:
        if (w.lower() != keyword):
            write_sort_text.managefile_main([w, sort_dict[w]])
            write_sort_text_all.managefile_main([w, sort_dict[w]])
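# A small sketch of the word-ranking step above, assuming a trivial whitespace
# tokenizer in place of the project's nlp_main.main_nlp word cutter; the sample
# texts and keyword are illustrative.
from collections import Counter

texts = [
    "covid vaccine rollout in bangkok",
    "vaccine news and covid updates",
]
keyword = "covid"

sort_dict = Counter()
for text in texts:
    sort_dict += Counter(text.split())  # main_nlp would return the cut tokens here

# Top words, skipping the search keyword itself, as cut_text does.
for word in sorted(sort_dict, key=sort_dict.get, reverse=True)[:11]:
    if word.lower() != keyword:
        print(word, sort_dict[word])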
def go_twitter(self, lang, count, keyword, since, until):
    try:
        print("=" * 10 + "Start Find Twitter" + "=" * 10)
        read_data = ManageFile("Twitter", keyword + "_Ncut" + lang,
                               ["time", "content", "places"], "r")
        csv_data = read_data.managefile_main()
        df_in = pandas.DataFrame(csv_data)
        # Which days already have data?
        condition1 = (df_in[0] >= f"{since} 00:00:00")
        condition2 = (df_in[0] <= f"{until} 23:59:59")
        temp = []  # temp collects the dates that already exist in the file
        df_out = df_in[0][condition1 & condition2].str.split(" ").apply(
            lambda x: temp.append(x[0]) if x[0] not in temp else None)
        for i in range(len(temp)):
            temp[i] = datetime.strptime(str(temp[i]), "%Y-%m-%d")
        temp.sort(reverse=True)

        # -------------------- Set the since and until time --------------------
        now = datetime.now()
        past = now - timedelta(days=7)
        now = datetime.strptime(str(now).split(" ")[0], "%Y-%m-%d")
        past = datetime.strptime(str(past).split(" ")[0], "%Y-%m-%d")
        until_new = until
        since_new = since
        temp_until = datetime.strptime(until_new, "%Y-%m-%d")
        temp_since = datetime.strptime(since_new, "%Y-%m-%d")
        if (temp_until >= temp_since):
            # Set the until date.
            if (temp_until > now and temp_since > now):
                return None
            else:
                if (now > temp_until):
                    until_new = until_new
                else:
                    until_new = str(now).split(" ")[0]
            # Set the since date.
            if (temp_until < past and temp_since < past):
                return None
            else:
                if (past < temp_since):
                    since_new = since_new
                else:
                    since_new = str(past).split(" ")[0]
        else:
            return None
        # -----------------------------------------------------------------------

        # --------------------- If no data can be found ---------------------
        if (temp == []):
            # print(since_new, until_new, "DO IT", 3)
            print(since_new, until_new, "DO IT")
            self.main_twitter(lang, count, keyword, since_new, until_new)
            return None
        # --------------------------------------------------------------------

        ############ Keep only the time period the program can actually search ############
        new_array = []
        end = None
        for k in temp:
            if (k <= now and k >= now - timedelta(days=7)):
                new_array.append(k)
        # print(new_array, 4)
        ####################################################################################

        # -------------------------------- Find the starting time --------------------------------
        point = None
        if (datetime.strptime(until_new, "%Y-%m-%d") not in new_array):
            # Add one day as the reference point, so the search starts from yesterday.
            point = datetime.strptime(until_new, "%Y-%m-%d") + timedelta(days=1)
        else:
            point = datetime.strptime(until_new, "%Y-%m-%d")
        point = point.strftime("%Y-%m-%d")
        point = datetime.strptime(point, "%Y-%m-%d")
        # print(point, 5)
        # ------------------------------------------------------------------------------------------

        # -------------------------------- Find the ending time --------------------------------
        if (since_new not in new_array):
            # Going back one day means using tomorrow as the boundary.
            end = datetime.strptime(since_new, "%Y-%m-%d") - timedelta(days=1)
            new_array.append(end)
        # print(new_array, 6)
        # ----------------------------------------------------------------------------------------

        # -------------------------------- Find the specific time gaps --------------------------------
        for point_stop in new_array:
            start = point - timedelta(days=1)
            stop = point_stop + timedelta(days=1)
            if (start >= stop):
                start = str(start).split(" ")[0]
                stop = str(stop).split(" ")[0]
                print(start, stop, "DO IT")
                self.main_twitter(lang, count, keyword, stop, start)
            else:
                print(start, stop, "DO NOT DO IT")
            point = point_stop
        # -----------------------------------------------------------------------------------------------
    except IndexError:
        pass
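# A compact sketch of the date-window clamping that go_twitter performs: the
# requested [since, until] range is clipped to the last 7 days, the window the
# standard Twitter search API can return. The function name, parameters, and
# example dates are illustrative.
from datetime import datetime, timedelta

def clamp_window(since, until, days_back=7):
    today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    oldest = today - timedelta(days=days_back)
    s = datetime.strptime(since, "%Y-%m-%d")
    u = datetime.strptime(until, "%Y-%m-%d")
    if u < s or u < oldest or s > today:
        return None  # the whole range lies outside what can be searched
    s = max(s, oldest)
    u = min(u, today)
    return s.strftime("%Y-%m-%d"), u.strftime("%Y-%m-%d")

print(clamp_window("2021-01-01", "2021-12-31"))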