def syncCache():
    # Rebuild a fresh cache and bring the primary cache up to date with it,
    # copying any files that are missing under CC_DIR.
    cache1 = Cache(GIT_DIR)
    cache1.start()

    cache2 = Cache(GIT_DIR)
    cache2.initial()

    for path in cache2.list():
        if not cache1.contains(path):
            cache1.update(path)
            if not isdir(join(CC_DIR, path.file)):
                copy(path.file)
    cache1.write()
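# A hypothetical sketch of the module-level names syncCache() assumes; in
# the real project Cache, GIT_DIR, CC_DIR and copy() come from the
# surrounding module, so everything below is a placeholder.
from os.path import isdir, join

GIT_DIR = '/path/to/git/worktree'    # placeholder
CC_DIR = '/path/to/clearcase/view'   # placeholder

def copy(file):
    # Placeholder: the real copy() mirrors `file` into the other tree.
    pass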
class DNS(BaseServer):
    """Caching DNS resolver."""

    def __init__(self, port, forwarder):
        super(DNS, self).__init__(port)
        self._forwarder = forwarder
        self._forwarder_corrupted = False
        self._cache = Cache()
        self._lock = Lock()

    def _client_req_handler(self, addr, packet):
        self._client = addr
        if not self._forwarder_corrupted:
            # If our own request came back to us, mark the forwarder as corrupted
            if addr[0] == self._forwarder[0]:
                self._forwarder_corrupted = True
                print(
                    '\n[-] Forwarder is corrupted (the request came back to us).\n'
                    '[!] Please shut down the server and specify a different forwarder'
                )
                self._return_server_resp(self._make_error_packet(packet))
        else:
            self._return_server_resp(self._make_error_packet(packet))
        question = self._get_question(packet)
        qname = get_qname(question)
        # question type
        qtype = struct.unpack('>H', question[question.find(b'\x00') + 1:][:2])[0]
        from_cache = False
        response = b''
        if self._cache.contains(qname, qtype):
            with self._lock:
                response, from_cache = self._cache.get(qname, qtype, packet[:2]), True
        if response in [b'', None]:
            response, from_cache = self._request_to_forwarder(
                qname, qtype, packet), False
        if not self._forwarder_corrupted:
            print("\n" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), end=" ")
            print("-- [*] {} {} {}".format(addr[0], TYPES[qtype], qname), end=" ")
            print('cache' if from_cache else 'forwarder')
            self._return_server_resp(response)

    # extract the QUESTION section
    def _get_question(self, packet):
        spacket = packet[12:]
        return spacket[:spacket.find(b'\x00') + 5]

    # send the response back to the client
    def _return_server_resp(self, packet):
        self._sock.sendto(packet, self._client)

    # build an error packet
    def _make_error_packet(self, packet):
        flags = '1' + set_padding(bin(packet[2])[2:])[1:]
        rcode = set_padding(bin(packet[3])[2:])
        return packet[:2] + struct.pack('>H', int(flags + rcode[:4] + '0010', 2)) + packet[4:]

    # forward the query to the upstream resolver
    def _request_to_forwarder(self, qname, qtype, packet):
        if packet is None:
            return
        with self._lock:
            sock = self._make_socket()
            try:
                sock.sendto(packet, self._forwarder)
                npacket, addr = sock.recvfrom(BUFFER_SIZE)
            except socket.error:
                self._return_server_resp(self._make_error_packet(packet))
                return  # no reply from the forwarder; npacket is undefined past here
            finally:
                sock.close()
            question = self._get_question(npacket)
            qnames = self._cache.push(qname, qtype, question, npacket)
            Thread(target=self.cache_inner_fields, args=(qnames, )).start()
            return npacket

    def _check_if_query(self, packet):
        return set_padding(bin(packet[3])[2:])[0] == '0'

    # cache nested records
    def cache_inner_fields(self, qnames):
        for qname in qnames:
            if qname in [None, '']:
                continue
            for qtype in self._cache.used_qtypes:
                self._request_to_forwarder(
                    qname, qtype, self.create_dns_request(qname, qtype))

    # build a DNS request
    def create_dns_request(self, name, _type):
        with self._lock:
            name = name.encode()
            # pack the header fields as big-endian binary data
            id = struct.pack('>H', randint(MIN_PORT, MAX_PORT))
            flags = b'\x01\x20'
            question = b'\x00\x01'
            answer = b'\x00\x00'
            authority = b'\x00\x00'
            addit = b'\x00\x00'
            qname = b''
            # note: a trailing '.' in name yields the zero-length root label
            # that terminates QNAME
            for part in name.split(b'.'):
                qname += struct.pack('B', len(part)) + part
            qtype = struct.pack('>H', _type)
            qclass = b'\x00\x01'
            return id + flags + question + answer + authority + addit + qname + qtype + qclass
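# What _make_error_packet() does, shown standalone: it keeps the query ID,
# forces the QR bit to 1 (response) and the RCODE to 2 (SERVFAIL), and
# leaves the remaining header bits and the body untouched. The
# set_padding() below is a hypothetical stand-in for the helper used above
# (assumed to zero-pad a bit string to 8 bits).
import struct

def set_padding(bits, width=8):
    return bits.zfill(width)

# 12-byte DNS header of a standard query: ID=0x1234, RD=1, one question.
query = b'\x12\x34\x01\x00\x00\x01\x00\x00\x00\x00\x00\x00'
flags = '1' + set_padding(bin(query[2])[2:])[1:]   # QR=1, keep OPCODE/RD
rcode = set_padding(bin(query[3])[2:])
error = query[:2] + struct.pack('>H', int(flags + rcode[:4] + '0010', 2)) + query[4:]
print(error[:4].hex())  # '12348102': QR=1, RD=1, RCODE=2 (SERVFAIL)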
class DNS(BaseServer):
    """Caching DNS resolver."""

    def __init__(self, port, forwarder):
        super(DNS, self).__init__(port)
        self._forwarder = forwarder
        self._forwarder_corrupted = False
        self._cache = Cache()
        self._lock = Lock()

    def _client_req_handler(self, addr, packet):
        print("In client request handler")
        self._client = addr
        if not self._forwarder_corrupted:
            if addr[0] == self._forwarder[0]:
                self._forwarder_corrupted = True
                print(CYCLE_MESSAGE)
                self._return_server_resp(self._make_error_packet(packet))
        else:
            self._return_server_resp(self._make_error_packet(packet))
        question = self._get_question(packet)
        qname = get_qname(question)
        qtype = struct.unpack('>H', question[question.find(b'\x00') + 1:][:2])[0]
        from_cache = False
        response = b''
        if self._cache.contains(qname, qtype):
            with self._lock:
                response, from_cache = self._cache.get(qname, qtype, packet[:2]), True
        if response in [b'', None]:
            response, from_cache = self._make_request2forwarder(
                qname, qtype, packet), False
        if not self._forwarder_corrupted:
            print("\n" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), end=" ")
            print("-- [*] {} {} {}".format(addr[0], TYPES[qtype], qname), end=" ")
            print('cache' if from_cache else 'forwarder')
            self._return_server_resp(response)

    def _get_question(self, packet):
        spacket = packet[12:]
        return spacket[:spacket.find(b'\x00') + 5]

    def _return_server_resp(self, packet):
        self._sock.sendto(packet, self._client)

    def _make_error_packet(self, packet):
        flags = '1' + set_padding(bin(packet[2])[2:])[1:]
        rcode = set_padding(bin(packet[3])[2:])
        return packet[:2] + struct.pack('>H', int(flags + rcode[:4] + '0010', 2)) + packet[4:]

    def _make_request2forwarder(self, qname, qtype, packet):
        if packet is None:
            return
        with self._lock:
            sock = self._make_socket()
            try:
                sock.sendto(packet, self._forwarder)
                npacket, addr = sock.recvfrom(BUFFER_SIZE)
            except socket.error:
                self._return_server_resp(self._make_error_packet(packet))
                return  # no reply; npacket is undefined past this point
            finally:
                sock.close()
            question = self._get_question(npacket)
            qnames = self._cache.push(qname, qtype, question, npacket)
            if qtype == 1:  # for A queries, also resolve and cache the NS records
                ns_packet = self.switch_a_to_ns(packet, qname)
                sock = self._make_socket()
                try:
                    sock.sendto(ns_packet, self._forwarder)
                    nepacket, addr = sock.recvfrom(BUFFER_SIZE)
                except socket.error:
                    self._return_server_resp(self._make_error_packet(packet))
                    return
                finally:
                    sock.close()
                question = self._get_question(nepacket)
                qnames = self._cache.push(qname, 2, question, nepacket)
                for auth_qname in qnames:
                    ns_packet = self.replace_qname(packet, qname, auth_qname)
                    sock = self._make_socket()
                    try:
                        sock.sendto(ns_packet, self._forwarder)
                        nepacket, addr = sock.recvfrom(BUFFER_SIZE)
                    except socket.error:
                        self._return_server_resp(
                            self._make_error_packet(packet))
                        continue
                    finally:
                        sock.close()
                    question = self._get_question(nepacket)
                    self._cache.push(auth_qname, 1, question, nepacket)
            # Thread(target=self.cache_inner_fields, args=(qnames,)).start()
            return npacket

    def replace_qname(self, packet, qname, new_qname):
        # Splice a new QNAME into the packet in place of the old one.
        f_part = packet[:packet.find(bytes(qname.split('.')[0], 'utf8')) - 1]
        l_part = packet[packet.find(bytes(qname.split('.')[-2], 'utf8')) + len(qname.split('.')[-2]) + 1:]
        qwe = b''
        try:
            for w in new_qname.split('.'):
                qwe += struct.pack('>B', len(w)) + bytes(w, 'utf8')
        except Exception:
            import sys
            print(sys.exc_info())
        return f_part + qwe + l_part

    def switch_a_to_ns(self, packet, qname):
        # Rewrite the low byte of the question's QTYPE from A (1) to NS (2).
        qname = qname.split('.')[-2]
        i = packet.find(bytes(qname, 'utf8')) + len(qname) + 2
        packet = packet[:i] + b'\x02' + packet[i + 1:]
        return packet

    def _check_if_query(self, packet):
        return set_padding(bin(packet[3])[2:])[0] == '0'

    def cache_inner_fields(self, qnames):
        for qname in qnames:
            if qname in [None, '']:
                continue
            for qtype in self._cache.used_qtypes:
                self._make_request2forwarder(
                    qname, qtype, self.create_dns_request(qname, qtype))

    def create_dns_request(self, name, _type):
        with self._lock:
            name = name.encode()
            id = struct.pack('>H', randint(MIN_VALUE, MAX_VALUE))
            flags = b'\x01\x20'
            question = b'\x00\x01'
            answer = b'\x00\x00'
            authority = b'\x00\x00'
            addit = b'\x00\x00'
            qname = b''
            # note: a trailing '.' in name yields the zero-length root label
            # that terminates QNAME
            for part in name.split(b'.'):
                qname += struct.pack('B', len(part)) + part
            qtype = struct.pack('>H', _type)
            qclass = b'\x00\x01'
            return id + flags + question + answer + authority + addit + qname + qtype + qclass
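# An end-to-end sketch of the request format create_dns_request() emits,
# sent directly to a public resolver over UDP; the resolver address and
# query name are illustrative only. The trailing '.' in the name is what
# produces the zero-length root label terminating QNAME.
import socket
import struct
from random import randint

def build_query(name, qtype=1):
    # Same field layout as create_dns_request() above.
    header = struct.pack('>HHHHHH', randint(0, 0xFFFF), 0x0120, 1, 0, 0, 0)
    qname = b''.join(struct.pack('B', len(p)) + p
                     for p in name.encode().split(b'.'))
    return header + qname + struct.pack('>H', qtype) + b'\x00\x01'

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.settimeout(3)
sock.sendto(build_query('example.com.'), ('8.8.8.8', 53))
reply, _ = sock.recvfrom(4096)
print('ANCOUNT =', struct.unpack('>H', reply[6:8])[0])  # answer record count
sock.close()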
class Bing_Search(object):
    def __init__(self, api_key, data_dir=None):
        self.cache = None
        if data_dir:
            cache_file = data_dir + "/bing.json"
            self.cache = Cache(cache_file)
        self.stopext = set([".pdf", ".doc", ".xls"])
        self.headers = {'Ocp-Apim-Subscription-Key': api_key}

    def is_valid(self, url):
        # Skip binary documents; we only want HTML pages
        if len(url) < 4 or url[-4:] in self.stopext:
            return False
        return True

    def search(self, query_term, count=10):
        """
        Reference: https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
        Args:
            count: Number of search results to return in the response. If
                count is greater than 50, the results are paged, since each
                query returns at most 50 results.
        """
        if self.cache and self.cache.contains(query_term):
            urls = self.cache.get(query_term)
            return [url for url in urls if self.is_valid(url)]
        urls = []
        offset = 0
        while count > 0:
            fetched = min(count, 50)
            params = urllib.urlencode({
                # Request parameters
                'q': query_term,
                'count': str(fetched),
                'offset': str(offset),
                'mkt': 'en-us',
                'safesearch': 'Moderate'})
            try:
                conn = httplib.HTTPSConnection('api.cognitive.microsoft.com')
                #conn.request("GET", "/bing/v5.0/search?%s" % params, "{body}", headers)
                conn.request("GET", "/bing/v7.0/search?%s" % params, "{body}", self.headers)
                response = conn.getresponse()
                data = response.read()
                obj = json.loads(data)
                if 'webPages' in obj:
                    webPages = obj['webPages']
                    values = webPages['value']
                    for value in values:
                        if self.is_valid(value['url']):
                            url = URLUtility.normalize(value['url'])
                            if url:
                                urls.append(url)
                conn.close()
            except:
                traceback.print_exc()
            count -= 50
            offset += fetched  # offset counts results to skip, not pages
        if self.cache:
            self.cache.add(query_term, urls)
        return urls

    def search_site(self, keyword, url, k=10):
        """
        Search inside a given website using the search command: "keyword site:url"

        Parameters:
            keyword: keyword used to search
            url: top level domain

        Returns:
            list of urls
        """
        keyword = keyword + " site:" + url
        return self.search(keyword, k)
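# A hypothetical usage sketch (Python 2, like the class above); the API
# key and data directory are placeholders, and a count above 50 exercises
# the paging path in search().
bing = Bing_Search("YOUR_BING_API_KEY", data_dir="data")
urls = bing.search("focused crawler", count=60)   # two paged requests
site_urls = bing.search_site("dataset", "example.com")
print len(urls), len(site_urls)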
class Search_APIs(object):
    def __init__(self, data_dir, fetcher):
        google_api_key = ""
        if not google_api_key:
            print "Error! google_api_key is missing"
            sys.exit(1)
        google_cse_id = ""  # Google custom search engine id
        if not google_cse_id:
            print "Error! google_cse_id is missing"
            sys.exit(1)
        self.google = Google_Search(google_api_key, google_cse_id)
        self.google_delay = 1  # 5 QPS limit: https://developers.google.com/webmaster-tools/search-console-api-original/v3/limits

        bing_api_key = ""
        if not bing_api_key:
            print "Error! bing_api_key is missing"
            sys.exit(1)
        self.bing = Bing_Search(bing_api_key)
        self.bing_delay = 1

        # Set up the cache for related search
        related_cache_file = data_dir + "/related_search.json"
        self.related_cache = Cache(related_cache_file)
        print "Loaded ", self.related_cache.length(), " queries from related search cache"

        # Set up the cache for backlink search
        access_id = ""
        if not access_id:
            print "Error! access_id is missing"
            sys.exit(1)
        secret_key = ""
        if not secret_key:
            print "Error! secret_key is missing"
            sys.exit(1)
        self.moz = Moz_Search(access_id, secret_key)
        backlink_cache_file = data_dir + "/backlink_search.json"
        self.backlink_cache = Cache(backlink_cache_file)
        print "Loaded ", self.backlink_cache.length(), " queries from backlink search cache"
        self.moz_delay = 1

        # Set up the cache for keyword search
        keyword_cache_file = data_dir + "/keyword_search.json"
        self.keyword_cache = Cache(keyword_cache_file)
        print "Loaded ", self.keyword_cache.length(), " queries from keyword search cache"

        # Set up the fetcher for forward search
        #self.fetcher = Fetcher(data_dir, "/forward_search.json")
        self.fetcher = fetcher
        self.link_extractor = Link_Extractor()
        self.k = 10  # number of keywords selected in each extraction
        self.max_urls = 10  # maximum number of urls to extract from each page
        self.keywords = set()  # keywords extracted from relevant sites

    def set_max_keywords(self, max_kw):
        self.k = max_kw

    def _extract_keywords(self, sites, k=10):
        """
        Extract the top k most frequent keywords, skipping ones that were
        already selected.
        """
        stop = stopwords.words('english')
        counter = Counter()
        for site in sites:
            for p in site:
                text = p.get_text('meta')
                text = URLUtility.clean_text(text)
                words = nltk.word_tokenize(text)
                words = [word for word in words
                         if word not in stop and len(word) > 2]
                bigram_words = [words[i] + ' ' + words[i + 1]
                                for i in xrange(len(words) - 1)]
                counter += Counter(words + bigram_words)
        # Get the top k words
        """
        counter = [(counter[w], w) for w in counter if counter[w]>1]  # convert to array
        heapq.heapify(counter)
        topk = heapq.nlargest(k, counter)
        return [w[1] for w in topk]
        """
        top_words = counter.most_common(k + len(self.keywords))
        result = []  # list of keywords to return
        i = 0
        while len(result) < k and i < len(top_words):
            if top_words[i][0] not in self.keywords:
                result.append(top_words[i][0])
                self.keywords.add(top_words[i][0])
            i += 1
        print " List of selected keywords: ", result
        return result

    def search(self, sites, searchop, seed_keyword="", max_results=50):
        """
        Args:
            max_results: maximum number of results to return in Bing/Google search
            searchop: str - one of 'rl', 'kw', 'fw', 'bl'
        """
        #sites = self.fetcher.fetch_sites(urls)
        results = set()
        if searchop == 'rl':
            for w in sites:
                print " Running related search..."
                urls = self.search_related(w.get_host(), max_results)
                results.update(urls)
        elif searchop == 'bl':
            """
            for w in sites:
                print " Search backlinks..."
                urls = self.search_backward_forward(w.get_host())
                results.update(urls)
            """
            urls = self.search_backward_forward_batch(sites)
            results.update(urls)
        elif searchop == 'fw':
            #urls = [w.get_url() for w in sites]
            print " Forward search...", len(sites), " urls"
            urls = self.search_forward_sites(sites)
            results.update(urls)
        # Run keyword search
        elif searchop == 'kw':
            print " Searching by keyword"
            keywords = self._extract_keywords(sites, self.k)
            for keyword in keywords:
                if seed_keyword:
                    keyword = seed_keyword + ' ' + keyword
                urls = self.search_keywords(keyword, max_results, se='bing')
                results.update(urls)
        print " Found ", len(results), " urls"
        return results

    def search_backward_forward(self, url):
        """
        Search related pages using backlink search and forward search.

        Returns:
            list of urls (potentially duplicated)
        """
        t = time.time()
        backlinks = self.search_backward(url)
        print "Backlink search time: ", time.time() - t
        t = time.time()
        fwlinks = self.search_forward(backlinks)
        print "Forward search time: ", time.time() - t
        return backlinks + fwlinks

    def search_backward_forward_batch(self, sites):
        """
        Search related pages using backlink search and forward search.

        Parameters:
            sites: list of Website objects
        Returns:
            list of urls (potentially duplicated)
        """
        t = time.time()
        backlinks = set()
        for site in sites:
            backlinks.update(self.search_backward(site.get_host()))
        backlinks = list(backlinks)
        print "Backlink search time: ", time.time() - t
        t = time.time()
        fwlinks = self.search_forward(backlinks)
        print "Forward search time: ", time.time() - t
        return backlinks + fwlinks

    def search_backward(self, url):
        """
        Search backlinks using the MOZ APIs.

        Returns:
            list of urls
        """
        if self.backlink_cache.contains(url):
            results = self.backlink_cache.get(url)
            print "hit backlink query: ", url
        else:
            #time.sleep(self.moz_delay)
            results = self.moz.search_backlinks(url)
            self.backlink_cache.add(url, results)
        print "Backlink Search - Query: ", url, " - Number of results: ", len(results)
        return results

    def search_keywords(self, keyword, max_results, se='google'):
        """
        Search relevant pages by keyword using Google or Bing.

        Args:
            max_results: maximum number of results to return
        """
        urls = []
        if self.keyword_cache.contains(keyword):
            urls = self.keyword_cache.get(keyword)
            print "hit keyword query: ", keyword
        else:
            if se == 'google':
                time.sleep(self.google_delay)
                urls = self.google.search(keyword, max_results)
            else:  # default: 'bing'
                time.sleep(self.bing_delay)
                urls = self.bing.search(keyword, max_results)
            self.keyword_cache.add(keyword, urls)
        """
        if 'items' in results:
            for item in results['items']:
                urls.append(url_normalize(item['link']))
        """
        print "Keyword Search - Query: ", keyword, " - Number of results: ", len(urls)
        return urls

    def search_forward_sites(self, sites, insite=False):
        """
        Fetch the pages and extract external links.

        Args:
            sites: list of Website objects
            insite: False if extracting links outside the host
        """
        outlinks = set()
        for site in sites:
            for page in site:
                if insite:
                    links = self.link_extractor.extract_insite_links(
                        page.get_url(), page.get_html())
                else:
                    links = self.link_extractor.extract_external_links(
                        page.get_url(), page.get_html())
                links = self.select_subset(links)
                outlinks.update(links)
        print "Forward Search ", " - Number of results: ", len(outlinks)
        return list(outlinks)

    def search_forward(self, urls, insite=False):
        """
        Fetch the pages and extract external links.

        Args:
            urls: list of urls
            insite: False if extracting links outside the host
        """
        sites = self.fetcher.fetch_sites(urls, allow_fetch_later=True)
        outlinks = set()
        for site in sites:
            for page in site:
                if insite:
                    links = self.link_extractor.extract_insite_links(
                        page.get_url(), page.get_html())
                else:
                    links = self.link_extractor.extract_external_links(
                        page.get_url(), page.get_html())
                links = self.select_subset(links)
                outlinks.update(links)
        print "Forward Search ", " - Number of results: ", len(outlinks)
        return list(outlinks)

    def select_subset(self, urls):
        """
        Each page might contain thousands of external urls, which pollute
        the results, so we only keep a fixed number of links from each page.

        How this works:
            - Pick one url from each site
            - If the maximum has not been reached, select random urls

        Returns:
            list of urls
        """
        if len(urls) <= self.max_urls:
            return urls
        results = []
        """
        cur = urls
        while len(results)<self.max_urls:
            sites = set()
            next = []
            for url in cur:
                site = URLUtility.get_host(url)
                if site not in sites:
                    sites.add(site)
                    results.append(url)
                else:
                    next.append(url)
                if len(results) == self.max_urls:
                    break
            cur = next
        """
        sites = set()
        for url in urls:
            site = URLUtility.get_host(url)
            if site not in sites:
                sites.add(site)
                results.append(url)
            if len(results) == self.max_urls:
                break
        return results

    def search_related(self, url, k):
        """
        Return a list of related urls using Google related search.
        """
        query = "related:" + url
        urls = []
        if self.related_cache.contains(query):
            urls = self.related_cache.get(query)
            print "hit related query: ", query
        else:
            time.sleep(self.google_delay)
            urls = self.google.search(query, k)
            self.related_cache.add(query, urls)
        """
        urls = []
        if 'items' in results:
            for item in results['items']:
                urls.append(url_normalize(item['link']))
        """
        print "Related Search - Query: ", url, " - Number of results: ", len(urls)
        return urls
class NewsSentiment:
    __positiveList = []
    __negativeList = []
    __stopwords = []

    def __init__(self):
        self.__cache = Cache("news-sentiment.json")
        self.__cityCache = Cache("city-cache.json")
        with open("positive.txt") as pFile:
            self.__positiveList = pFile.read().lower().split()
        with open("negative.txt") as nFile:
            self.__negativeList = nFile.read().lower().split()
        with open("stopwords.txt") as sFile:
            self.__stopwords = sFile.read().lower().split()
        self.__key = "4e28e4b30b954544b5d808b4d54b37a4"
        self.__positiveCount = 0
        self.__negativeCount = 0
        self.__stopwordCount = 0
        with open("news-id.json") as nifile:
            self.__news_id = json.load(nifile)
        self.__news_id_str = ','.join(map(str, self.__news_id))

    def __find(self, needle, haystack):
        # Naive substring search that compares salted hashes of each
        # needle-length window; returns the start indices of all matches.
        found = []
        h_len = len(haystack)
        n_len = len(needle)
        for i in range(0, h_len):
            str_part = ""
            innerLen = 0
            if i + n_len <= h_len:
                innerLen = i + n_len
            else:
                break
            for j in range(i, innerLen):
                str_part += haystack[j]
            h1 = hash(needle + "_salt") % 1997
            h2 = hash(str_part + "_salt") % 1997
            if h1 == h2:
                found.append(i)
        return found

    def calculate_polarity(self, word):
        # Count positive, negative and stopword tokens; polarity is the
        # difference between positive and negative counts.
        polarity = 0
        parts = word.split()
        p = 0
        n = 0
        s = 0
        for part in parts:
            part = part.strip().lower()
            if part in self.__positiveList:
                polarity += 1
                p += 1
            elif part in self.__negativeList:
                polarity -= 1
                n += 1
            elif part in self.__stopwords:
                s += 1
        return polarity, p, n, s

    def prefetch_news_city(self, cities):
        for city in cities:
            self.fetch_news_sentiment(city)

    def fetch_news_sentiment(self, country):
        country = country.lower()
        api_id = self.__news_id_str  # source ids are pre-joined in __init__
        print("Analyzing news for ", country, "...")
        sentiments = []
        if not self.__cityCache.contains(country):
            url = ("https://newsapi.org/v2/everything?q=" + country +
                   "&sources=" + api_id + "&apiKey=" + self.__key)
            country_news = requests.get(url).json()
            self.__cityCache.set(
                country, {
                    "articles": country_news["articles"],
                    "totalResults": country_news["totalResults"]
                })
        cache_country = self.__cityCache.get(country)
        articles = cache_country["articles"]
        res_count = cache_country["totalResults"]
        _len = min(res_count, 6)  # analyze at most six articles per city
        for i in range(0, _len):
            article = articles[i]
            title = article["title"]
            url = article["url"]
            key = url
            if not self.__cache.contains(key):
                try:
                    news = Article(url)
                    news.download()
                    news.parse()
                    title_score, tp, tn, ts = self.calculate_polarity(title)
                    news_score, np, nn, ns = self.calculate_polarity(news.text)
                    sentiment = {
                        "title": title_score,
                        "news": news_score,
                        "total": (title_score + news_score),
                        "stats": {
                            "p": tp + np,
                            "n": tn + nn,
                            "s": ts + ns
                        }
                    }
                    self.__cache.set(key, sentiment)
                except Exception as ex:
                    print("Unable to fetch news for ", country, "! Msg: ", ex)
                    continue  # nothing cached for this article; skip it
            sentiment = self.__cache.get(key)
            self.__positiveCount += sentiment["stats"]["p"]
            self.__negativeCount += sentiment["stats"]["n"]
            self.__stopwordCount += sentiment["stats"]["s"]
            sentiments.append(sentiment)
        return sentiments

    def showWordStatistics(self):
        sentiment_plotter.plotResults(self.__positiveCount,
                                      self.__negativeCount,
                                      self.__stopwordCount)
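# A minimal usage sketch; it assumes positive.txt, negative.txt,
# stopwords.txt and news-id.json exist alongside the script and that the
# NewsAPI key above is valid. The city name is illustrative.
ns = NewsSentiment()
for s in ns.fetch_news_sentiment("Oslo"):
    print(s["total"], s["stats"])
ns.showWordStatistics()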
NewsSentiment().prefetch_news_city(cities)

print("Plotting cities on map...")
if not os.path.exists("static\\city_on_map.html"):
    mark_cities.plot_map_with_marked_cities(cities, "static\\city_on_map.html")

print("Building graph for cities...")
graphCities = MapGraph()

print("Getting coordinates for each city...")
for city in cities:
    print("Acquiring coordinates for ", city, "...")
    k_city = city.replace(" ", "_").lower()
    if city_coord.contains(k_city):
        print("Coordinate exists!")
    else:
        coordinate = city_mapping.convertCityToCoordinates(city)
        print(city, " is at ", coordinate)
        city_coord.set(k_city, coordinate)

for fromCity in cities:
    for toCity in cities:
        k_from = fromCity.replace(" ", "_").lower()
        k_to = toCity.replace(" ", "_").lower()
        distance = None
        if fromCity == toCity:
            print("Same origin and destination. Distance = 0")
            distance = 0