Example #1
    def ranking_domain(self, keyword, lang):
        # return a DataFrame of domains ranked by how many crawled links for this keyword belong to each domain

        read_data = ManageFile("WebCrawler", keyword + "_cut" + lang, [], "r")
        df = pandas.DataFrame(read_data.managefile_main())

        csv_ = ""
        array = []
        temp = df[3].str.lower()
        if (lang == "en"):
            csv_ = df[temp.str.contains("|".join(self.DOMAIN_en))]
            array = self.DOMAIN_en
        elif (lang == "th"):
            csv_ = df[temp.str.contains("|".join(self.DOMAIN_th))]
            array = self.DOMAIN_th
        elif (lang == "all"):
            csv_ = df[temp.str.contains("|".join(self.DOMAIN))]
            array = self.DOMAIN
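        # Note: "|".join(self.DOMAIN_en) builds a regex alternation such as
        # "bbc.com|cnn.com" (hypothetical domains), and Series.str.contains()
        # treats it as a regular expression by default, so the "." in each
        # domain will match any single character.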

        write = ManageFile("Hit_Trends", "top_domain", ["keyword", "number"],
                           "w")

        # count, for each domain, how many links contain it, then sort in descending order
        sorted_dict = {}
        for i in array:
            sorted_dict[i] = temp.str.contains(i).sum()
        a = {}
        for i in sorted(sorted_dict, key=sorted_dict.get, reverse=True):
            write.managefile_main([i, sorted_dict[i]])
            a[i] = sorted_dict[i]

        df_out = pandas.DataFrame({"keyword": list(a.keys()), "number": list(a.values())})

        return df_out
Example #2
    def check_keyword(self, keyword):
        file_name = 'list_keywords.csv'
        file_name_Ncsv = 'list_keywords'
        try:
            save = open(file_name, "r")
            df = pandas.read_csv(save)
            # check whether this keyword has already been searched.
            condition = (df["keywords"] == keyword)
            num = len(df[condition])

            # if num > 0, this keyword has already been searched.
            twitter = ManageFile("Twitter", keyword + "_Ncut" + self.lang,
                                 ["time", "content", "places"], "r")
            crawler = ManageFile("WebCrawler", keyword + "_Ncut" + self.lang,
                                 ["time", "header", "content", "link"], "r")

            if (num > 0 and (not twitter.do_it or not crawler.do_it)):
                return True

            twitter.close()
            crawler.close()

            save.close()

            save = ManageFile("", file_name_Ncsv, ["keywords"], "a")
            save.managefile_main([keyword])

        except FileNotFoundError:
            # first run: the keyword list file does not exist yet.
            temp = open(file_name, "w", newline='')
            temp.write("keywords\n")
            temp.write(f"{keyword}\n")
            temp.close()

        return False
Example #3
    def test_managefile_main(self):
        row_len_old = -1
        read = open("Test_write_file/" + "test" + ".csv", "r")
        reader = csv.reader((line.replace('\0', '') for line in read),
                            delimiter=",")
        for i in reader:
            row_len_old += 1

        writefile = ManageFile("Test_write_file", "test", ["a", "b", "c"], "w")
        for i in range(random.randint(0, 10), random.randint(11, 100)):
            writefile.managefile_main([i * 0, i * 1, i * 2])
        writefile.close()
        row_len_new = -1
        read = open("Test_write_file/" + "test" + ".csv", "r")
        reader = csv.reader((line.replace('\0', '') for line in read),
                            delimiter=",")
        for i in reader:
            row_len_new += 1

        self.assertGreater(row_len_new, row_len_old)
Example #4
    def hit_trends(self):
        start = time.time()

        column = ["keyword", "tweet"]

        writer = ManageFile("Hit_Trends", "Hit_Trends", column, "w")

        # WOEID of Bangkok
        woeid = 1225448

        # fetching the trends
        trends = self.api.trends_place(id=woeid)

        # printing the information
        print("The top trends for the location are :")

        for value in trends:
            for trend in value['trends']:
                writer.managefile_main([trend["name"], trend["tweet_volume"]])
                #print(trend["name"], trend["tweet_volume"])

        print(time.time() - start, "hittwitter")
Example #5
class Twitter_API:
    def __init__(self, nlp):
        # Key and token
        CONSUMER_KEY = "ku1u0AkXp7DiD8UuDFBD5ejc7"  # aka API key
        CONSUMER_SECRET = "3OifKHMc5Ik7VMUhjoGUu4BZBDLRDLUTeM6Qo2M70OYKqHgpGP"  # aka API key secret

        ACCESS_TOKEN = "1348183052179001347-Sy8D0nHWqhVjKYiQ2cVTNgkv6m1HYW"
        ACCESS_TOKEN_SECRET = "Tars6ymAzSCwLTTxGfeqR78cJTAhm7c7mfen5UAXKa1WQ"

        # Authenticate to Twitter
        self.auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        self.auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

        # Create API object
        self.api = tweepy.API(self.auth,
                              wait_on_rate_limit=True,
                              wait_on_rate_limit_notify=True)

        self.nlp2 = nlp

    def main_twitter(self, lang, count, keyword, since, until, update=False):
        # constant
        OFFSET = 38555555555555
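        # Rough sketch of why subtracting from max_id pages backwards in time,
        # assuming standard Twitter snowflake IDs (millisecond timestamp stored
        # above the low 22 bits): 1 ms corresponds to 2**22 ≈ 4.2e6 id units, so
        # OFFSET = 38555555555555 ≈ 3.9e13 id units ≈ 9.2e6 ms ≈ 2.5 hours.
        # Each request with max_id = maxId - OFFSET therefore jumps roughly
        # 2.5 hours further into the past.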
        # open the output file in append mode
        column = ['time', 'content', 'places']
        self.writer2 = ManageFile("Twitter", keyword + "_Ncut" + lang, column,
                                  "a")
        self.date_since = datetime.strptime(since + " 00:00:00",
                                            "%Y-%m-%d %H:%M:%S")
        until = datetime.strptime(until, "%Y-%m-%d") + timedelta(days=1)
        until = str(until).split(" ")[0]
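        # The standard v1.1 search API's "until" parameter is exclusive (it
        # returns tweets created before that date), so one day is added above
        # to keep the requested end date in the results.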

        count_ = 0  # counts how many search rounds have run
        maxId = -1  # starting id (no max_id on the first request)
        moreOFFSET = 0
        tricker = True  # loop flag

        query = keyword  # the word to search for
        count = count  # The number of results to try and retrieve per page.
        tweet_mode = "extended"
        result_type = "current"
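        # Note: the v1.1 standard search API documents result_type values of
        # "mixed", "recent" and "popular"; an undocumented value like "current"
        # presumably falls back to the API default.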

        # when update mode is active
        while (tricker):
            print(count_, "round Twitter")
            print(count_)
            try:
                # print("ok1")
                if (maxId <= 0 and moreOFFSET < 1):
                    # first round
                    data = self.api.search(q=query,
                                           lang=lang,
                                           count=count,
                                           tweet_mode=tweet_mode,
                                           result_type=result_type,
                                           until=until)
                else:
                    # subsequent rounds
                    if (moreOFFSET >= 1):
                        # OFFSET keeps growing each round
                        data = self.api.search(
                            q=query,
                            lang=lang,
                            count=count,
                            tweet_mode=tweet_mode,
                            result_type=result_type,
                            max_id=str(maxId - OFFSET - 555555555 -
                                       (100000000 * moreOFFSET)),  # a x 10^13
                            until=until)
                        count_ += 1
                        moreOFFSET += 1
                    else:
                        # constant OFFSET
                        data = self.api.search(
                            q=query,
                            lang=lang,
                            count=count,
                            tweet_mode=tweet_mode,
                            result_type=result_type,
                            max_id=str(maxId - OFFSET),  # a x 10^13
                            until=until)
                # print("ok2")
                maxId = data[-1].id
                count_ += 1  # counter
                tricker = self.write_csv(data, keyword, lang, since,
                                         until)  # write info to .csv
            except IndexError:
                # data[-1].id raised IndexError, i.e. no tweets were found at all
                # print("no data")
                moreOFFSET += 1
                count_ += 1
                if (count_ >= 10):
                    tricker = False
            except tweepy.error.TweepError:
                pass

        self.writer2.close()
        print("Twitter Done", count_)

    def update_mode(self, query, lang, count, tweet_mode, result_type, since,
                    until):
        data = tweepy.Cursor(self.api.search,
                             q=query,
                             lang=lang,
                             count=count,
                             tweet_mode=tweet_mode,
                             result_type=result_type,
                             until=until).items()

        #tricker = self.write_csv(data, query, lang, since, until)
        return data

    def write_csv(self, data, keyword, lang, since, until):
        # write the .csv file for checking and recording info
        """column = ['time', 'content', 'places']
        self.writer2 = ManageFile("Twitter", keyword+"_Ncut"+lang, column, "a")
        self.date_since = datetime.strptime(since+" 00:00:00", "%Y-%m-%d %H:%M:%S")"""

        for infor in data:
            date_created_at = datetime.strptime(str(infor.created_at),
                                                "%Y-%m-%d %H:%M:%S")
            #print(date_created_at, self.date_since)
            # stop once the tweet is older than date_since
            if (date_created_at < self.date_since):
                #print(date_created_at, self.date_since, "<")
                return False

            all_lang = self.nlp2.detection_lang(infor.full_text)
            check_lang = lang == all_lang
            if (lang == "all"):
                check_lang = ("en" == all_lang) or ("th" == all_lang)
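            # skip retweets (a retweet's full_text begins with "RT @<user>:",
            # so any text containing "RT @" is dropped) and keep only tweets
            # whose detected language matches the request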
            if (("RT @" not in infor.full_text) and check_lang):
                #print([str(infor.created_at), infor.full_text, infor.user.location])
                self.writer2.managefile_main([
                    str(infor.created_at), infor.full_text, infor.user.location
                ])
                #writerow( {'places': infor.user.location, 'time': str(infor.created_at), 'message':infor.full_text, 'link':"-"} )
        return True

    def hit_trends(self):
        start = time.time()

        column = ["keyword", "tweet"]

        writer = ManageFile("Hit_Trends", "Hit_Trends", column, "w")

        # WOEID of Bangkok
        woeid = 1225448

        # fetching the trends
        trends = self.api.trends_place(id=woeid)

        # printing the information
        print("The top trends for the location are :")

        for value in trends:
            for trend in value['trends']:
                writer.managefile_main([trend["name"], trend["tweet_volume"]])
                #print(trend["name"], trend["tweet_volume"])

        print(time.time() - start, "hittwitter")
Example #6
class websites_crawler:
    def __init__(self, nlp):
        self.URL_en = []
        self.URL_th = []

        read_url_en = open("website_crawler_en.txt", "r")
        read_url_th = open("website_crawler_th.txt", "r")

        for lib in read_url_en:
            self.URL_en.append(lib.split("\n")[0])

        for lib in read_url_th:
            self.URL_th.append(lib.split("\n")[0])

        self.DOMAIN_en = []
        self.DOMAIN_th = []
        for ie in self.URL_en:
            self.DOMAIN_en.append(ie.split("/")[2])
        for it in self.URL_th:
            self.DOMAIN_th.append(it.split("/")[2])

        # the value in set_pattern is the priority level; larger means more important, so patterns are ordered by it
        set_pattern = {
            ("meta", "property", "og:type"): 3,
            ("meta", "name", "og:type"): 2,
            ("meta", "property", "og:url"): 1,
            ("meta", "name", "og:url"): 0
        }
        self.pattern_list = sorted(set_pattern,
                                   key=set_pattern.get,
                                   reverse=True)
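        # Sorting by priority (descending) yields:
        #   [("meta", "property", "og:type"), ("meta", "name", "og:type"),
        #    ("meta", "property", "og:url"), ("meta", "name", "og:url")]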

        self.DOMAIN = self.DOMAIN_en + self.DOMAIN_th
        self.nlp_web = nlp
        self.check_bug = open("check_bugA.txt", "w", newline="\n")
        self.check_thread = open("check_threadA.txt", "w", newline="\n")
        self.check_data = open("check_dataA.txt", "w", newline="\n")
        self.check_data_a = open("check_data_aA.txt", "w", newline="\n")
        self.check_done = open("check_doneA.txt", "w", newline="\n")

        self.MAX_THREADS = len(self.DOMAIN)  # thread one layer (one domain per thread) at a time
        self.thread_local = threading.local()
        self.output = []
        self.output2 = []
        self.output3 = []
        self.output4 = []
        self.output_write = [[], []]

    def download_url(self, url, domain, count):
        links = Counter()
        try:
            # log which thread is running
            print(str(self.DOMAIN.index(domain)) + "_A" + domain + "\n")
            self.check_thread.write(
                str(self.DOMAIN.index(domain)) + "_A" + domain + "\n")
            self.check_thread.flush()
            tage_a_all = []
            html_code = []

            # requests html code from server
            if isinstance(url, str):
                session = self.get_session()
                resp = session.get(url)
                html_code = resp.content
                html_page = BeautifulSoup(html_code, "html.parser")
                tage_a_all = html_page.find_all("a")
            elif isinstance(url, bytes):
                html_code = url
                html_page = BeautifulSoup(html_code, "html.parser")
                tage_a_all = html_page.find_all("a")

            # find the topic/menu links so links that also appear there can be skipped
            topic = self.find_topic(html_code, domain)

            # find links in every <a> tag on the page
            for x in tage_a_all:
                try:
                    if (x["href"]):
                        temp = self.link_format(x["href"], domain)
                        # only keep links on the same domain
                        same = temp.split("/")[2]
                        if (domain == same):
                            # on the first round, collect every link found
                            if (count == self.count):
                                links += Counter([temp])
                            else:
                                # from the second round on, skip links already in the output or in the topic links
                                if (temp not in topic
                                        and temp not in self.output):
                                    links += Counter([temp])
                except IndexError:
                    pass
                except KeyError:
                    pass
            perfect = []
            for i in links.keys():
                perfect.append(i)
                self.check_data_a.write(str(i) + "\n")
                self.check_data_a.flush()
            self.output += perfect
            self.output4 += perfect
            print("length:", len(perfect), " output:", len(self.output),
                  " topic:", len(topic), " round:",
                  str(count) + "\n")
        except:
            error = traceback.format_exc()
            print(error)
            self.check_bug.write(str(error) + "\n" + " " + str(url))
            self.check_bug.flush()
            pass
        return self.output4

    def analytics_url(self, link, topic, domain):
        t__0 = time.time()
        try:
            # log which thread is running
            print(str(self.DOMAIN.index(domain)) + "_B")
            self.check_thread.write(
                str(self.DOMAIN.index(domain)) + "_B" + domain + "\n")
            self.check_thread.flush()

            # requests html code from server
            session = self.get_session()
            res = session.get(link, timeout=20)
            html_code = res.content
            soup = BeautifulSoup(html_code, "html.parser")
            topic = self.find_topic(html_code, domain)

            # if the link is one of the topic links, reject it
            if (link in topic):
                tage_a_all = soup.find_all("a")
                self.output2.append(link)
                self.output3.append(html_code)
                print(time.time() - t__0)
                self.check_done.write(str(link) + "\n")
                self.check_done.flush()
                return "No"

            # meta tag patterns
            type_ = None
            for pattern in self.pattern_list:
                type_ = soup.find(pattern[0], {pattern[1]: pattern[2]})
                if (type_ != None):
                    break

            try:
                # meta tag patterns, rare cases
                if (type_["content"] == ""):
                    # og:type is empty string
                    print(time.time() - t__0)
                    self.check_done.write(str(link) + "\n")
                    self.check_done.flush()
                    return "website"
                elif (type_["content"] == link):
                    # the og:url meta tag contains this same link
                    #self.find_message(html_code, link)
                    #self.output_write[0].append(link)
                    #self.output_write[1].append(html_code)
                    self.output2.append(link)
                    self.output3.append(html_code)
                    print(time.time() - t__0)
                    self.check_done.write(str(link) + "\n")
                    self.check_done.flush()
                    return "article"
            except TypeError:
                pass
            except:
                print("UnkonwError", link)
                self.check_bug.write(
                    str(traceback.format_exc()) + "\n" + " " + str(link))
                self.check_bug.flush()

            # if the meta tag content is "article", the page can be written to file
            if (type_):
                x = type_["content"]
                if (x == "article"):
                    #self.find_message(html_code, link)
                    #self.output_write[0].append(link)
                    #self.output_write[1].append(html_code)
                    self.output2.append(link)
                    self.output3.append(html_code)
                print(time.time() - t__0)
                self.check_done.write(str(link) + "\n")
                self.check_done.flush()
                return x
            else:
                print(time.time() - t__0)
                self.check_done.write(str(link) + "\n")
                self.check_done.flush()
                return "No meta type"

        # --------------------- not a valid link ---------------------
        except requests.exceptions.MissingSchema:
            print("MissingSchema", link)
            return "No"
        except requests.exceptions.InvalidSchema:
            print("InvalidSchema", link)
            return "No"
        except requests.exceptions.SSLError:
            print("SSLError", link)
            return "No"
        except requests.exceptions.ConnectionError:
            print("ConnectionError", link)
            return "No"
        except requests.exceptions.ReadTimeout:
            print("ReadTimeout", link)
            return "No"
        except requests.exceptions.TooManyRedirects:
            print("TooManyRedirects", link)
            return "No"
        except requests.exceptions.ChunkedEncodingError:
            print("ChunkedEncodingError", link)
            return "No"
        except:
            print("UnkonwError", link)
            self.check_bug.write(
                str(traceback.format_exc()) + "\n" + " " + str(link))
            self.check_bug.flush()
            return "No"
        # ----------------------------------------------------------------

    def find_message(self, html_code, url):
        t_0 = time.time()
        soup = BeautifulSoup(html_code,
                             'html.parser',
                             parse_only=SoupStrainer("div"))
        tit = BeautifulSoup(html_code, 'html.parser')

        title = tit.find("meta", property="og:title")  # find title
        title = title["content"] if title else ""  # find title

        message = soup.find_all(name="p")  # find message

        temp_message = ""
        output = []

        for i in message:
            temp_message += i.text + "\n"

        time_ = self.find_time(html_code)
        try:
            data = [
                time_[0] + " " + time_[1],
                str(title),
                str(temp_message),
                str(url)
            ]
            # =========================== This point is writing ===========================
            column = ['time', 'header', 'content', 'link']
            self.write = ManageFile("WebCrawler/Database", time_[0], column,
                                    "a", ["link"])
            # =============================================================================
            self.write.managefile_main(data)  # write file
            #print("Write file")
            self.check_data.write(
                str(url) + " " + str(time.time() - t_0) + "\n")
            self.check_data.flush()
        except TypeError:
            # the date string could not be built (time_ missing); skip writing this page
            pass

    def find_time(self, html_code):
        # find time from website
        try:
            soup = BeautifulSoup(html_code,
                                 'html.parser',
                                 parse_only=SoupStrainer("script"))
            date = soup.find_all(name="script")

            reg = re.compile(
                r'(?P<date>202\d-\d\d-\d\d)(?P<time>T\d\d:\d\d:\d\d| \d\d:\d\d)'
            )
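            # e.g. "2021-01-27T06:31:00" -> date "2021-01-27", time "T06:31:00"
            # (the leading "T" or space is stripped below with [1:])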
            ou = reg.search(str(date))
            date_output = ou.group("date")
            time_output = ou.group("time")[1:]

            return [str(date_output), str(time_output)]
        except AttributeError:
            # Ex:: Jan 27 2021 06:31:00:000PM+07:00 ==> 2021-01-27 18:31:00
            try:
                reg = re.compile(
                    r'(?P<date>\w\w\w \d\d \d\d\d\d)(?P<time> \d\d:\d\d:\d\d:000AM| \d\d:\d\d:\d\d:000PM)'
                )
                ou = reg.search(str(date))
                date_output = ou.group("date")
                time_output = ou.group("time")[1:]

                temp1 = datetime.strptime(date_output, "%b %d %Y")
                temp2 = datetime.strptime(time_output, "%I:%M:%S:000%p")

                return [str(temp1).split(" ")[0], str(temp2).split(" ")[1]]
            except AttributeError:
                # it isn't Jan 27 2021 06:31:00:000PM+07:00
                date_now = str(datetime.now()).split(" ")
                reg = re.compile(
                    r'(?P<date>202\d-\d\d-\d\d)(?P<time> \d\d:\d\d:\d\d)')
                ou = reg.search(str(datetime.now()))
                date_output = ou.group("date")
                time_output = ou.group("time")[1:]

                return [str(date_output), str(time_output)]

    def find_topic(self, html_code, domain):
        try:
            # find link in tag nav or header or div
            #res = requests.get(url, timeout=20)
            #html_page = res.content
            set_html_tag = ["nav", "header", "div"]  # tags to try in order (passed to SoupStrainer below)
            data = []
            count = 0
            # -------------------------------------- header --------------------------------------
            while (data == [] and (count != len(set_html_tag))):
                # while data is an empty list, keep switching the html tag to search
                soup = BeautifulSoup(html_code,
                                     'html.parser',
                                     parse_only=SoupStrainer(
                                         set_html_tag[count]))
                data = soup.find_all(name="ul")
                count += 1
            storage = []
            for i in data:
                temp = i.find_all("li")
                for j in temp:
                    try:
                        g = j.find("a")["href"]
                        g = self.link_format(g, domain)
                        if (g == ""):
                            continue
                        storage.append(g)
                    except TypeError:
                        #print(g)
                        pass
                    except KeyError:
                        #print(g)
                        pass
            # -------------------------------------------------------------------------------------

            # -------------------------------------- tail --------------------------------------
            soup1 = BeautifulSoup(html_code,
                                  'html.parser',
                                  parse_only=SoupStrainer("footer"))
            sub_footer = []
            for i in soup1.find_all("a"):
                if (i.get("href") == None):
                    continue
                footer = self.link_format(i.get("href"), domain)
                if (footer == ""):
                    continue
                sub_footer.append(footer)

            return storage + ["*" * 10] + sub_footer
        except requests.exceptions.ReadTimeout:
            return []
        except requests.exceptions.TooManyRedirects:
            return []
            # ---------------------------------------------------------------------------------

    def find_domain(self, url):
        temp = []
        for i in url:
            temp.append(i.split("/")[2])
        return temp

    def link_format(self, str_input, domain):
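        # A few illustrative conversions (hypothetical inputs):
        #   link_format("/news/123", "www.example.com")       -> "https://www.example.com/news/123"
        #   link_format("//cdn.example.com/a", "example.com")  -> "https://cdn.example.com/a"
        #   link_format("example.com/page", "example.com")     -> "https://example.com/page"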

        # an empty input string is not a link, so str_out stays empty
        if (str_input == ""):
            str_out = ""
        else:
            str_out = re.search(
                r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
                str_input)

            # if the regex found nothing, build the full link from the relative path
            if (str_out == None):
                if (str_input[0:2] == "//" and len(str_input) > 3):
                    str_out = "https:" + str_input
                elif (str_input[0] == "/" and len(str_input) > 3):
                    str_out = "https://" + domain + str_input
                elif (str_input[0:2] == "./" and len(str_input) > 3):
                    str_out = "https://" + domain + "/" + str_input[2:]
                    #print(str_out)
                else:
                    str_out = ""
            else:
                # the regex matched, so str_out is already a searchable link
                str_out = str_out.group()
                # but some matches are missing the https:// or http:// scheme
                if ("https://" in str_out or "http://" in str_out):
                    pass
                else:
                    str_out = "https://" + str_out

        return str(str_out)

    def concurrent_futures(self, func, arg1, arg2, arg3):
        threads = len(arg1) + 1
        r = None
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            executor.map(func, arg1, arg2, arg3)

    def get_session(self):
        if not hasattr(self.thread_local, "session"):
            self.thread_local.session = requests.Session()
        return self.thread_local.session

    def searching(self, keyword, lang, since, until):
        print("Start Crawler")
        column = ['time', 'header', 'content', 'link']
        check = ManageFile(
            fold_name="WebCrawler", file_name="", column_data=column,
            mode="a")  # file_name="" it's mean do not create file before.
        temp_until = datetime.strptime(until, "%Y-%m-%d")
        temp_since = datetime.strptime(since, "%Y-%m-%d")

        dif = temp_until - temp_since

        if (dif == timedelta(days=0)):
            dif = "0 day"
        print(dif)
        day = int(str(dif).split(" ")[0]) + 1
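        # str(timedelta(days=3)) == "3 days, 0:00:00", so split(" ")[0] gives "3";
        # a zero-length delta prints as "0:00:00", which is why dif is replaced
        # with the string "0 day" above.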
        array = []

        for i in range(day):
            date = str(temp_since + timedelta(days=i)).split(" ")[0]
            print(date)
            df = None
            if (lang == "en"):
                df = check.find_copy_to(keyword=keyword,
                                        reader="Database\\" + date,
                                        column=["link", "header"],
                                        condition=[self.DOMAIN_en, keyword],
                                        nlp=self.nlp_web)
            elif (lang == "th"):
                df = check.find_copy_to(keyword=keyword,
                                        reader="Database\\" + date,
                                        column=["link", "header"],
                                        condition=[self.DOMAIN_th, keyword],
                                        nlp=self.nlp_web)
            elif (lang == "all"):
                df = check.find_copy_to(keyword=keyword,
                                        reader="Database\\" + date,
                                        column=["link", "header"],
                                        condition=[self.DOMAIN, keyword],
                                        nlp=self.nlp_web)
            array.append(df)
        if (dif == "0 day"):
            array.append(pandas.DataFrame(columns=column))

        result = pandas.concat(array)
        target_file = open(check.path + "\\" + keyword + "_cut" + lang +
                           ".csv",
                           "w",
                           newline="")
        target_file.write(result.to_csv(index=False))

    def main_crawler(self, keyword, lang, since, until, update=False):
        if (update):
            self.check_data.write("Start: " + str(datetime.now()) + "\n")
            self.check_data.flush()
            self.URL = self.URL_en + self.URL_th
            self.count = 2
            url = self.URL.copy()
            domain = self.DOMAIN.copy()

            all_time0 = time.time()

            for count in range(self.count, 0, -1):
                all_t0 = time.time()
                # download all pages with download_url
                t0 = time.time()
                self.concurrent_futures(self.download_url, url, domain,
                                        [count] * len(url))
                t1 = time.time()
                print(
                    f"time: {t1-t0} seconds, length: {len(self.output4)} links. 1"
                )

                # run analytics_url across the thread pool
                t0 = time.time()
                domain = self.find_domain(self.output4)
                self.concurrent_futures(self.analytics_url, self.output4,
                                        ["s"] * len(self.output4), domain)
                t1 = time.time()
                print(
                    f"time: {t1-t0} seconds, length: {len(self.output4)} links. 3"
                )

                # identical links are treated as one link
                url = self.output3.copy()
                domain = self.find_domain(self.output2)
                self.output2 = []
                self.output3 = []
                self.output4 = []
                #self.output_write = [[],[]]

                all_t1 = time.time()
                cou = len(open("check_data.txt").readlines())
                print(
                    f"time: {all_t1-all_t0} seconds, length: {cou-1} links. 4")

            all_time1 = time.time()
            cou = len(open("check_data.txt").readlines())
            self.check_data.write("time: " + str(all_time1 - all_time0) +
                                  " seconds, length: " + str(cou - 1) +
                                  " link")
            self.check_data.flush()
            print(
                f"time: {all_time1-all_time0} seconds, length: {cou-1} links.")
        else:
            self.searching(keyword, lang, since, until)
Example #7
    def geometry_map(self, keyword, lang):
        # ------------------------------------Read file------------------------------------
        read = ManageFile("Twitter", keyword + "_cut" + lang,
                          ["time", "content", "places"], "r")
        read_2 = ManageFile("GUI_show", "location_lati_long",
                            ["places", "lati", "long"], "r")
        read_3 = ManageFile("GUI_show", "non_exist_lati_long",
                            ["places", "lati", "long"], "r")
        # ---------------------------------------------------------------------------------

        # ------------------------------------write file------------------------------------
        write_exist = ManageFile("GUI_show", "location_lati_long",
                                 ["places", "lati", "long"], "a")
        write_non_exist = ManageFile("GUI_show", "non_exist_lati_long",
                                     ["places", "lati", "long"], "a")
        write_data_show = ManageFile("GUI_show", "map_lati_long",
                                     ["places", "lati", "long"], "w")
        # ---------------------------------------------------------------------------------

        location = ""  # variable that use for checking that location is exist in database location
        lati = ""
        longi = ""

        # temp arrays used to check whether a location has already been seen
        # (the file is used too, but it may not be fully written yet)
        # data holds location names that resolve; non_exist holds names that do not
        non_exist = []
        data = []

        # data from the database
        data_exist = []
        data_non_exist = []

        count = 0  # counts addresses newly resolved with geopy

        # data_exist holds location names that geopy can resolve
        first = 0
        for i in read_2.managefile_main():
            i[0] = self.nlp_main.clear_name_places(i[0]).lower()
            if (first > 0):
                data_exist.append(i)
            first += 1
        first = 0
        # data_non_exist holds location names that geopy cannot resolve
        for i in read_3.managefile_main():
            i[0] = self.nlp_main.clear_name_places(i[0]).lower()
            if (first > 0):
                data_non_exist.append(i)
            first += 1

        # read data from the Twitter database to find latitude & longitude with geopy
        first = 0
        for i in read.managefile_main():
            i[2] = self.nlp_main.clear_name_places(i[2]).lower()
            if (first > 0):
                if (i[2] == ""):
                    first += 1
                    continue
                try:
                    for j in data_exist:  # does it exist in the database (file)?
                        if (i[2] == j[0]):
                            location = "exist1"
                            lati = j[1]
                            longi = j[2]
                            write_data_show.managefile_main(
                                [i[2], str(lati), str(longi)])
                            data.append([i[2], str(lati), str(longi)])
                            print("exist1")

                    if (location != "exist1"):  # does it exist in the temp array?
                        for k in data:
                            if (i[2] == k[0]):
                                location = "exist2"
                                lati = k[1]
                                longi = k[2]
                                print("exist2")
                        if (location == "exist2"):
                            write_data_show.managefile_main(
                                [i[2], str(lati), str(longi)])
                            data.append([i[2], str(lati), str(longi)])

                    if (location != "exist1" and location !=
                            "exist2"):  # it's non_exist in DataBase.

                        for p in data_non_exist:  # is this location already known to fail in geopy?
                            if (i[2] == p[0]):
                                location = "non_exist"
                                print("non_exist")

                        if (i[2] in non_exist):
                            first += 1
                            continue

                        if (location != "non_exist"):  # is it a new address?
                            location2 = self.geolocator.geocode(i[2])
                            lati = location2.latitude
                            longi = location2.longitude
                            print("Geopy")
                            write_data_show.managefile_main(
                                [i[2], str(lati), str(longi)])
                            write_exist.managefile_main(
                                [i[2], str(lati), str(longi)])
                            data.append([i[2], str(lati), str(longi)])
                            count += 1

                    location = ""
                except AttributeError:
                    write_non_exist.managefile_main(
                        [i[2], str(lati), str(longi)])
                    non_exist.append(i[2])
                except geopy.exc.GeocoderUnavailable:
                    write_non_exist.managefile_main(
                        [i[2], str(lati), str(longi)])
                    non_exist.append(i[2])
                except geopy.exc.GeocoderServiceError:
                    pass
            first += 1

        print(len(data), count)  # number of resolved locations & number of new geopy lookups

        # ---------------- build the DataFrame of places with latitude & longitude ----------------
        dict_ = {}
        places_array = []
        lati_array = []
        long_array = []

        for i in data:
            places_array.append(i[0])
            lati_array.append(i[1])
            long_array.append(i[2])

            dict_["places"] = places_array
            dict_["lati"] = lati_array
            dict_["long"] = long_array

        if (data == []):
            dict_ = {"places": [], "lati": [], "long": []}

        df = pandas.DataFrame(dict_)
        # ------------------------------------------------------------------------------------------------
        return df
Example #8
    def cut_text(self, folder, keyword, column, lang, since, until):
        # -----------------------read file for content-----------------------
        # tokenize the file that has already been filtered by the selected time range
        read = None
        if (folder == "WebCrawler"):
            read = ManageFile(folder, keyword + "_cut" + lang, column, "r")
        elif (folder == "Twitter"):
            read_data = ManageFile(folder, keyword + "_Ncut" + lang, column,
                                   "r")

            # ----------------------- read the file into pandas -----------------------
            csv_data = read_data.managefile_main()
            pd_data = pandas.DataFrame(csv_data)
            # --------------------------------------------------------------

            # ----------------------- select the time range -----------------------
            data_ = self.read_time(folder, pd_data, since, until)
            # -----------------------------------------------------

            # ----------------------- write a temporary file -----------------------
            data_str = data_.to_csv(index=False)
            #print(data_str)
            write_file = open(read_data.path + "\\" + keyword + "_cut" + lang +
                              ".csv",
                              "w",
                              newline="")
            write_file.write(data_str)
            write_file.close()
            # -----------------------------------------------------------
            read = ManageFile(folder, keyword + "_cut" + lang, column, "r")
        else:
            read = ManageFile(folder, keyword + "_cut" + lang, column, "r")

        data = read.managefile_main()
        write_sort_text = ManageFile(
            "GUI_show", keyword + "_ranking_" + str(folder).lower() + lang,
            ["keyword", "number"], "w")
        write_sort_text_all = ManageFile("GUI_show",
                                         keyword + "_ranking_all" + lang,
                                         ["keyword", "number"], "w")

        # -------------------------------------------------------------------

        # ------------------------------column-------------------------------
        column_section = 0
        if (folder == "WebCrawler"):
            column_section = 2
        elif (folder == "Twitter"):
            column_section = 1
        # -------------------------------------------------------------------
        print(
            "*****************************************" + folder +
            " Start SENTIMENT & NLP*****************************************")
        sort_dict = Counter()
        first = 0
        start = time.time()
        for i in data:
            # (1) tokenize the text with NLP and run sentiment analysis at the same time
            if (first > 0):
                cut1 = self.nlp_main.main_nlp(i[column_section])

                if (folder == "WebCrawler"):
                    self.array_sentiment_web.append(
                        self.sentiment_text(cut1, i[column_section], lang))
                elif (folder == "Twitter"):
                    self.array_sentiment_twi.append(
                        self.sentiment_text(cut1, i[column_section], lang))
                self.array_sentiment.append(
                    self.sentiment_text(cut1, i[column_section], lang))
                print(len(self.array_sentiment))
                sort_dict += Counter(cut1)
            first += 1
        print(
            first,
            time.time() - start,
            "*****************************************" + folder +
            " END SENTIMENT & NLP*****************************************")
        print("ALL: " + str(len(self.array_sentiment)) + ", Twitter:" +
              str(len(self.array_sentiment_twi)) + ", WebCrawler:" +
              str(len(self.array_sentiment_web)))
        # (2) sort the words and write a file that the GUI can display
        for w in sorted(sort_dict, key=sort_dict.get, reverse=True)[:11]:
            if (w.lower() != keyword):
                write_sort_text.managefile_main([w, sort_dict[w]])
                write_sort_text_all.managefile_main([w, sort_dict[w]])
Example #9
    def go_twitter(self, lang, count, keyword, since, until):
        try:
            print("=" * 10 + "Start Find Twitter" + "=" * 10)
            read_data = ManageFile("Twitter", keyword + "_Ncut" + lang,
                                   ["time", "content", "places"], "r")

            csv_data = read_data.managefile_main()
            df_in = pandas.DataFrame(csv_data)

            # which days already have data
            condition1 = (df_in[0] >= f"{since} 00:00:00")
            condition2 = (df_in[0] <= f"{until} 23:59:59")

            temp = []  # temp stores the dates already present in the file
            df_out = df_in[0][condition1 & condition2].str.split(" ").apply(
                lambda x: temp.append(x[0]) if x[0] not in temp else None)
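            # the apply() above is used only for its side effect of filling
            # temp with unique dates; df_out itself is just a Series of None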
            for i in range(len(temp)):
                temp[i] = datetime.strptime(str(temp[i]), "%Y-%m-%d")
            temp.sort(reverse=True)

            # -------------------- set since and until time -----------------------
            now = datetime.now()
            past = now - timedelta(days=7)
            now = datetime.strptime(str(now).split(" ")[0], "%Y-%m-%d")
            past = datetime.strptime(str(past).split(" ")[0], "%Y-%m-%d")

            until_new = until
            since_new = since
            temp_until = datetime.strptime(until_new, "%Y-%m-%d")
            temp_since = datetime.strptime(since_new, "%Y-%m-%d")
            if (temp_until >= temp_since):
                # set until date
                if (temp_until > now and temp_since > now):
                    return None
                else:
                    if (now > temp_until):
                        until_new = until_new
                    else:
                        until_new = str(now).split(" ")[0]
                # set since date
                if (temp_until < past and temp_since < past):
                    return None
                else:
                    if (past < temp_since):
                        since_new = since_new
                    else:
                        since_new = str(past).split(" ")[0]
            else:
                return None
            # ---------------------------------------------------------------------
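            # Worked example (hypothetical dates): with since="2021-01-20",
            # until="2021-01-27" and now=2021-01-26, past is 2021-01-19, so
            # until_new becomes "2021-01-26" (the requested until is in the
            # future) while since_new stays "2021-01-20" (within the last week).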

            # --------------------- if no data was found ------------------
            if (temp == []):
                #print(since_new, until_new, "DO IT",3)
                print(since_new, until_new, "DO IT")
                self.main_twitter(lang, count, keyword, since_new, until_new)
                return None
            # --------------------------------------------------------

            ######################### only the time period the program can search #############################
            new_array = []
            end = None
            for k in temp:
                if (k <= now and k >= now - timedelta(days=7)):
                    new_array.append(k)
            #print(new_array,4)
            ##################################################################################################

            # -------------------------------- find starting time -------------------
            point = None
            if (datetime.strptime(until_new, "%Y-%m-%d") not in new_array):
                # add 1 day as a reference point so the search covers the previous day
                point = datetime.strptime(until_new,
                                          "%Y-%m-%d") + timedelta(days=1)
            else:
                point = datetime.strptime(until_new, "%Y-%m-%d")
            point = point.strftime("%Y-%m-%d")
            point = datetime.strptime(point, "%Y-%m-%d")
            #print(point,5)
            # -----------------------------------------------------------------------

            # ------------------------------- find ending time ---------------------
            if (since_new not in new_array):
                # go back one day so the day after it is still covered
                end = datetime.strptime(since_new,
                                        "%Y-%m-%d") - timedelta(days=1)
                new_array.append(end)
            #print(new_array,6)
            # ----------------------------------------------------------------------

            # ------------------------ find specific time --------------------------
            for point_stop in new_array:

                start = point - timedelta(days=1)
                stop = point_stop + timedelta(days=1)
                if (start >= stop):
                    start = str(start).split(" ")[0]
                    stop = str(stop).split(" ")[0]
                    print(start, stop, "DO IT")
                    self.main_twitter(lang, count, keyword, stop, start)
                else:
                    print(start, stop, "DO NOT DO IT")

                point = point_stop
            # ----------------------------------------------------------------------
        except IndexError:
            pass