def pull_msg_content_ecolog(msg_raw):
    # The subject sits between the ECOLOG-L list prefix and the List-Subscribe header.
    re_subject = re.compile('(?<=Subject: \[ECOLOG-L\] ).*(?=List-Subscribe)')
    subject = re_subject.findall(msg_raw.get_payload())
    subject = [x.replace("\\r\\n", "") for x in subject]
    if len(subject) == 0:
        subject = ["error"]
    # print(subject)

    # The body sits between the Content-Type/Content-Transfer-Encoding headers and the
    # "Manage your Group settings" footer; try each charset variant until one matches.
    body_patterns = [
        '(?<=Content-Type: text\\/plain; charset="us-ascii"\\\\r\\\\nContent-Transfer-Encoding: quoted-printable).*?(?=Manage your Group settings)',
        '(?<=Content-Type: text\\/plain; charset="iso-8859-1"\\\\r\\\\nContent-Transfer-Encoding: ).*?(?=Manage your Group settings)',
        '(?<=Content-Type: text\\/plain; charset="UTF-8"\\\\r\\\\nContent-Transfer-Encoding: quoted-printable).*?(?=Manage your Group settings)',
        '(?<=Content-Type: text\\/plain; charset="utf-8"\\\\r\\\\nContent-Transfer-Encoding: quoted-printable).*?(?=Manage your Group settings)',
        '(?<=Content-Type: text\\/plain; charset="UTF-8"\\\\r\\\\nContent-Transfer-Encoding: ).*?(?=Manage your Group settings)',
        '(?<=Content-Type: text\\/plain; charset="Windows-1252"\\\\r\\\\nContent-Transfer-Encoding: ).*?(?=Manage your Group settings)',
    ]
    body = []
    for pattern in body_patterns:
        body = re.compile(pattern).findall(msg_raw.get_payload())
        if len(body) > 0:
            break
    if len(body) == 0:
        body = ["error"]  # keep a list so extract_url and the clean-up below behave consistently

    url = utils.extract_url(body)
    # some urls have an equal sign...
    body = [x.replace("=", "") for x in body]
    body = [x.replace("~", "") for x in body]
    body = [x.replace("--", "") for x in body]
    return [subject, body, url]
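# Hedged usage sketch, not from the original source: assuming the ECOLOG-L digests sit
# in a local mbox file (the path "ecolog.mbox" is a placeholder), each message object
# exposes get_payload() and can be fed straight to pull_msg_content_ecolog.
import mailbox

def collect_ecolog_posts(mbox_path="ecolog.mbox"):
    posts = []
    for msg_raw in mailbox.mbox(mbox_path):
        subject, body, url = pull_msg_content_ecolog(msg_raw)
        posts.append({"subject": subject, "body": body, "url": url})
    return posts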
def submit(link):
    # Classify the link with the trained model and map its output onto the Results enum.
    url = extract_url(link)
    process_test_url(url, 'test_features.csv')
    return_ans = tr.gui_caller('url_features.csv', 'test_features.csv')
    a = str(return_ans).split()
    if int(a[1]) == 0:
        return Results.SAFE
        # answer = tkMessageBox.askquestion("Redirect", "Do you want to visit the url?")
        # if answer == 'yes':
        #     webbrowser.open(url=E1.get(), new=1)
    elif int(a[1]) == 1:
        return Results.MALICIOUS
        # tkMessageBox.showinfo("URL Checker Result", "The URL " + url + " is Malicious")
        # answer_2 = tkMessageBox.askquestion("Redirect", "The url MALICIOUS, Do you still want to visit the url?")
        # if answer_2 == 'yes':
        #     webbrowser.open(url=E1.get(), new=1)
    else:
        # tkMessageBox.showinfo("URL Checker Result", "The URL " + url + " is Malware")
        # tkMessageBox.showwarning("Warning", "Cant Redirect, url contains a malware")
        return Results.MALWARE
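# Hedged usage sketch (assumes only what submit() already uses: the Results enum with
# SAFE, MALICIOUS and MALWARE members; the helper name is illustrative).
def check_and_report(link):
    verdict = submit(link)
    if verdict == Results.SAFE:
        print("URL looks safe")
    elif verdict == Results.MALICIOUS:
        print("URL classified as malicious")
    else:
        print("URL appears to contain malware")
    return verdict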
def get_url_and_display_variant(update, context):
    user = context.chat_data['user']
    search_url = utils.extract_url(update, context)
    if search_url is not None:
        logger.info("Bot extracted URL: %s", search_url)
        channel = utils.extract_domain(search_url)
        if channel in SUPPORTED_CHANNELS:
            update.message.reply_text(f"Brb! I'm learning more about this product on {channel}.")
            item_dict, variants_dict, variants_display_dict = utils.get_item_information(channel, search_url)
            update.message.reply_markdown(f"Hurray! I've found \n\n{item_dict['item_name']}\n\n"
                                          'Which of these product variations would you like to track?',
                                          reply_markup=ReplyKeyboardMarkup.from_column(variants_display_dict,
                                                                                       one_time_keyboard=True))
            logger.info(f"BOT: prompted {user.first_name} for variant choice")

            # Store in context
            context_store_item(item_dict, context)
            context.chat_data['item_url'] = utils.shorten_url([search_url])[0]
            context.chat_data['channel'] = utils.extract_domain(search_url)
            context.chat_data['variants'] = variants_dict
            logger.info(context.chat_data['variants'])
            context.chat_data['variants_displayed'] = variants_display_dict
            # context.chat_data['item'] = item_dict
            logger.info(f"CONTEXT: Stored channel, variants, display, url for item {item_dict['item_name']}")
            return CHOOSE_THRESHOLD
        else:
            update.message.reply_text(f"Oops, I do not support {channel} yet. Let's try again.",
                                      reply_markup=ReplyKeyboardMarkup(start_reply_keyboard, one_time_keyboard=True))
            return INITIAL_CHOICE
    else:
        update.message.reply_text("Oops, you did not key in a valid URL. Let's try again.",
                                  reply_markup=ReplyKeyboardMarkup(start_reply_keyboard, one_time_keyboard=True))
        return INITIAL_CHOICE
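# Hedged sketch (assumption, not shown in the source): context_store_item is called
# above but not defined here; a minimal version might simply copy the item details
# into chat_data, mirroring the commented-out context.chat_data['item'] assignment.
def context_store_item_sketch(item_dict, context):
    context.chat_data['item'] = item_dict
    context.chat_data['item_name'] = item_dict.get('item_name')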
def in_white_list(url):
    domain, _ = extract_url(url)
    for d in ds:
        if d in domain:
            return True
    return False
def log(self, js_url, page_url):
    domain, uri = extract_url(page_url)
    d = self.getDomain(domain)
    page = d.getPage(domain + uri)
    page.logjs(js_url)
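# Hedged sketch (assumption, not the project's actual helper): the extract_url used by
# in_white_list and log above appears to return a (domain, uri) pair; a minimal
# urlparse-based stand-in could look like this.
from urllib.parse import urlparse

def extract_url_sketch(url):
    parts = urlparse(url)
    uri = parts.path or '/'
    if parts.query:
        uri += '?' + parts.query
    return parts.netloc, uri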
def to_absolute_url(url):
    '''
    Converts urls like "/discover/" to "http://www.kickstarter.com/discover/"
    '''
    return extract_url('{0}{1}'.format(ROOT_URL, url))
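# Hedged usage sketch (assumes ROOT_URL == 'http://www.kickstarter.com', as the
# docstring suggests, and that extract_url returns the normalised URL string here):
#   to_absolute_url('/discover/')  ->  'http://www.kickstarter.com/discover/'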
with open('password') as f:
    password = f.read()

import sys

args = sys.argv[1:]
if len(args) != 1:
    print 'Usage: title.py folder'
    sys.exit()
folder, = args

# Connect to the IMAP server and fetch every message in the requested folder.
context = imapclient.create_default_context()
context.verify_mode = ssl.CERT_NONE
imap = IMAPClient(host, ssl=True, ssl_context=context)
imap.login(username, password)
imap.select_folder(folder)
msgids = imap.search()
response = imap.fetch(msgids, ['BODY.PEEK[]'])

messages = []
for msgid in msgids:
    header = response[msgid]['BODY[]']
    message = email.message_from_string(header)
    messages.append(message)

# Extract one URL per message, resolve page titles in parallel, and tally the titles.
counter = collections.Counter()
pool = multiprocessing.Pool()
urls = [utils.extract_url(message) for message in messages]
counter.update(pool.imap(utils.title_of_url, urls))
for title, count in counter.most_common():
    print title, count
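# Hedged sketch (assumption, not from the source): utils.title_of_url is mapped over
# the URLs above but not defined here; one plausible shape fetches the page and pulls
# the <title> text, falling back to the URL itself on any error.
import re
import urllib2

def title_of_url_sketch(url):
    try:
        html = urllib2.urlopen(url, timeout=10).read()
        match = re.search(r'<title[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
        return match.group(1).strip() if match else url
    except Exception:
        return url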
def test_extract_url():
    assert utils.extract_url(['https://asdf.dd']) == 'https://asdf.dd'
def update():
    # Load the cached database, downloading a fresh copy if it is missing or unreadable.
    try:
        db = utils.load_pickle(DB_PATH)
        last_update = sorted(db['date'])[-1]
    except:
        utils.download(DB_URL, DB_PATH)
        db = utils.load_pickle(DB_PATH)
        last_update = sorted(db['date'])[-1]

    # query arxiv api
    n_added = 0
    indx = 0
    while indx < MAX_ITER:
        url = BASE_URL + QUERY_FMT.format(DEF_QUERY, indx, RESULTS_PER_ITER)
        try:
            with urllib.request.urlopen(url, timeout=5.0) as url:
                response = url.read()
        except TimeoutError:
            continue

        response = feedparser.parse(response)
        for entry in response.entries:
            e = utils.encode_feedparser_dict(entry)
            paper_url = utils.parse_arxiv_url(e["link"])
            date = e["published"]
            date = utils.convert_to_datetime(date)

            # content already in database
            if paper_url in db["url"]:
                if date <= last_update:
                    indx = MAX_ITER
                    break
                else:
                    continue

            # retrieve and clean some text
            title = e["title"]
            title = utils.rem_tex_fmt(title)
            authors = ", ".join(f"{n['name']}" for n in e["authors"])
            abstract = e["summary"]
            abstract = utils.rem_tex_fmt(abstract)
            other_urls = utils.extract_url(abstract)
            journal = e["arxiv_journal_ref"] if "arxiv_journal_ref" in e else ""
            journal = utils.rem_tex_fmt(journal)

            db["date"].append(date)
            db["url"].append(paper_url)
            db["title"].append(title)
            db["authors"].append(authors)
            db["abstract"].append(abstract)
            db["journal"].append(journal)
            db["other_urls"].append(other_urls)
            n_added += 1

        if len(response.entries) == 0:
            utils.progress_bar(indx / MAX_ITER, status="API not responding. retrying...")

        if indx == MAX_ITER:
            utils.progress_bar(1)
        else:
            indx += 100
            utils.progress_bar(indx / MAX_ITER, status=f"Fetching papers from {date}...")
        time.sleep(WAIT_TIME)

    print(f"{n_added} papers added to database")

    # sort every column of the database by date before saving
    if True:
        indx = list(np.argsort(db["date"]))
        db["date"] = list(np.array(db["date"])[indx])
        db["url"] = list(np.array(db["url"])[indx])
        db["title"] = list(np.array(db["title"])[indx])
        db["authors"] = list(np.array(db["authors"])[indx])
        db["abstract"] = list(np.array(db["abstract"])[indx])
        db["journal"] = list(np.array(db["journal"])[indx])
        db["other_urls"] = list(np.array(db["other_urls"])[indx])
        utils.save_pickle(DB_PATH, db)

    # rebuild the BM25 index over lower-cased titles plus filtered abstracts
    tkn_corpus = []
    for indx in range(len(db["url"])):
        title = db["title"][indx].lower()
        abstract = utils.filter_abstract(db["abstract"][indx].lower())
        tkn_corpus.append((title + " " + abstract).split(" "))
    bm25 = BM25Okapi(tkn_corpus)
    utils.save_pickle(CACHE_BM25, bm25)
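# Hedged usage sketch (not part of update() in the source): once update() has written
# DB_PATH and CACHE_BM25, a search helper could score a query against the cached
# BM25Okapi index; the helper name and top_n parameter are illustrative assumptions.
def search_db_sketch(query, top_n=10):
    db = utils.load_pickle(DB_PATH)
    bm25 = utils.load_pickle(CACHE_BM25)
    scores = bm25.get_scores(query.lower().split(" "))
    best = np.argsort(scores)[::-1][:top_n]
    return [(db["title"][i], db["url"][i]) for i in best]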
def save_content(url, title, content, evn):
    domain, uri = extract_url(url)
    d = Domain(domain, evn)
    d.update_content(uri, title, content)
def index():
    form = Input(request.form)
    if request.method == 'POST' and form.validate():
        url = form.url.data
        base_url = extract_url(url)
        domain_info = getDomainInfo(base_url)['WhoisRecord']
        useful_domain_info = {}
        empty = False

        # Pull the WHOIS fields we want to display, tolerating missing registrant data.
        try:
            del domain_info['registrant']['rawText']
            domain_info['registrant']['street'] = domain_info['registrant'].pop('street1')
            useful_domain_info = {
                'Registrar Name': domain_info['registrarName'],
                'Registrant Details': domain_info['registrant'],
                'Creation Date': domain_info['createdDate'],
                'Updation Date': domain_info['updatedDate'],
                'Expiration Date': domain_info['expiresDate'],
                'Domain Name': base_url
            }
        except KeyError as e:
            if e.message == 'registrant':
                print 'REGISTRANT-------------------------------------------'
                useful_domain_info = {}
                empty = True
            elif e.message == 'street1':
                print 'STREET-------------------------------------------'
                useful_domain_info = {
                    'Registrar Name': domain_info['registrarName'],
                    'Registrant Details': domain_info['registrant'],
                    'Creation Date': domain_info['createdDate'],
                    'Updation Date': domain_info['updatedDate'],
                    'Expiration Date': domain_info['expiresDate'],
                    'Domain Name': base_url,
                }
                empty = False
            else:
                print('---------------ELSE------')
                print e.message
        except Exception as e:
            print('-----------EXCEPTION-------------')
            print(e.message)

        result = submit(url)
        verdict = ''
        if result == Results.SAFE:
            verdict = 'SAFE'
        elif result == Results.MALICIOUS:
            verdict = 'MALICIOUS'
        else:
            verdict = 'MALWARE'

        # Cache the verdict for this domain the first time we see it.
        if db.session.query(Store).filter(Store.url == base_url).count() == 0:
            info = Store(base_url, verdict)
            db.session.add(info)
            db.session.commit()

        if result == Results.SAFE:
            print('----------- SAFE -----------')
            return render_template('safe.html', url=url, base_url=extract_url(url),
                                   info=useful_domain_info, isempty=empty)
        elif result == Results.MALICIOUS or result == Results.MALWARE:
            print('----------- MALICIOUS -----------')
            return render_template('malicious.html', url=url, base_url=extract_url(url),
                                   info=useful_domain_info, empty=empty)

    return render_template('index.html', form=form)
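# Hedged sketch (assumption, not from the source): the Input form instantiated in
# index() presumably exposes a single url field with validation; a minimal WTForms
# definition consistent with form.url.data and form.validate() could look like this.
from wtforms import Form, StringField, validators

class InputSketch(Form):
    url = StringField('URL', [validators.InputRequired(), validators.URL()])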