def encrypt(seed, sens_priv_num, c_factor, sensor_data):
    data_size = np.size(sensor_data)
    sensing_matrix = Utility.generate_sensing_matrix(
        seed, round((1 - c_factor) * data_size), data_size)
    compressed_data = np.dot(sensing_matrix, sensor_data)
    return compressed_data * Utility.generate_secret_value(
        seed, sens_priv_num)
class Logger(object):
    "Class for logging tweet data"

    def __init__(self, messageFileName="default"):
        self.messageFileName = messageFileName
        self.messageLogger = csv.writer(open(
            self.messageFileName, 'wb'))
        self.messageLogger.writerow(["time", "code", ""])
        self.tweetLogger = csv.writer(open("tweets.csv", 'a'))
        self.util = Utility()
        self.time = self.util.currentTimeSeconds()
        self.timeLogger = csv.writer(open("timing.csv", 'wb'))
        self.timeLogger.writerow(['impl', 'execTime', 'highestEmo',
                                  'currentTime'])

    def logMessage(self, code, message):
        self.time = self.util.currentTimeMillis()
        self.messageLogger.writerow([self.time, code, message])
        print(str(self.time) + "," + str(code) + "," + str(message) + "\n")

    def logTweet(self, tweet):
        self.time = self.util.currentTimeMillis()
        # strip out weird characters that prevent the csv from being written
        tweetText = tweet.text
        cleanText = filter(lambda x: x in string.printable, tweetText)
        exclude = set([',', ';'])
        cleanText = ''.join(ch for ch in cleanText if ch not in exclude)
        # write tweets to the dedicated tweet log (tweets.csv)
        self.tweetLogger.writerow([self.util.currentTimeSeconds(),
                                   tweet.created_at, cleanText,
                                   tweet.lang, tweet.location])

    def logTiming(self, qualifier, execTime, highestEmo):
        self.time = self.util.currentTimeMillis()
        self.timeLogger.writerow([qualifier, execTime, highestEmo, self.time])
def __init__(self, search_rel_question_doc_alg_str="BM25F"):
    if search_rel_question_doc_alg_str == "TF_IDF":
        self.search_alg = TF_IDF
    elif search_rel_question_doc_alg_str == "Frequency":
        self.search_alg = Frequency
    else:
        self.search_alg = BM25F
    Utility.set_stem()
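TF_IDF, Frequency, and BM25F look like the scoring classes from Whoosh's whoosh.scoring module; assuming that is the library in use, a minimal sketch of how the selected search_alg might be applied when querying an index (the index directory and the "content" field name are illustrative, not from the snippet above):

import whoosh.index as index
from whoosh.qparser import QueryParser
from whoosh.scoring import BM25F

# Hypothetical index location and field name; BM25F() stands in for self.search_alg().
ix = index.open_dir("indexdir")
with ix.searcher(weighting=BM25F()) as searcher:
    query = QueryParser("content", ix.schema).parse("example question")
    for hit in searcher.search(query, limit=10):
        print(hit["content"])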
def _on_page(self, page, domain):
    self.logger.info("Searching email address on page")
    clean_html = Utility.clean_html(str(page))
    emails = re.findall(cfg.email_regex, clean_html, re.I)
    if emails:
        emails = map(Utility.normalize_email, emails)
    return emails
def search_email_in_domain(self, domain):
    self.check_driver()
    try:
        self._go_to_page(domain)
        soup = BS(self.driver.page_source, "lxml")

        # Find emails on the page and on linked pages
        _email_founds = []
        for pos in POSSIBLE_POSITION:
            action_list = {
                'on_page': self._on_page,
                'on_link': self._on_link
            }
            email = action_list.get(pos)(soup, domain)
            _email_founds.append(email)
        email_candidates = Utility.flatten_list(_email_founds)

        if str(domain).endswith('.id') or str(domain).endswith('.id/'):
            emails = self.search_id_domain(domain)
            email_candidates += [email for email in emails
                                 if email not in email_candidates]

        if not email_candidates:
            # If no email was found
            self.logger.info('Email not found on domain %s', domain)
            # Find it using whois
            return []
        else:
            # If emails were found, filter them
            final_candidates = self._filter_email_candidates(
                email_candidates)
            return self.sort_email(final_candidates, domain)
    except Exception as exc:
        print "Error on domain {} {} ".format(domain, str(exc))
        return []
def DiffImage(self: Any, source: Dict[str, Any]) -> None:
    """Diff the provided image data source."""

    filename: str = source["filename"]
    url: str = source["url"]
    allowRevert: bool = source.get("allowRevert", True)

    # Append the current timestamp to the end of the URL as an
    # attempt to prevent the Discord CDN from serving previously
    # cached versions of an image.
    timestamp: str = str(int(datetime.utcnow().timestamp()))
    imageUrl: str = f"{url}?{timestamp}"

    older: Dict[str, Any] = source["older"]
    old: Dict[str, Any] = source["old"]
    new: Dict[str, Any] = source["new"]

    if old["raw"] == new["raw"]:
        logger.info(f"No difference found in {filename} ({url})")
        return
    elif (allowRevert is False) and (older["raw"] == new["raw"]):
        logger.info(f"Ignored revert found in {filename} ({url})")
        return

    source["urlTrim"] = Utility.Truncate(self, url, 256)
    old["size"] = Utility.Base64Size(self, old["raw"])
    new["size"] = Utility.Base64Size(self, new["raw"])

    success: bool = SitRep.Notify(
        self,
        {
            "title": source["urlTrim"],
            "description": None,
            "url": url,
            "filename": source["filename"],
            "imageUrl": imageUrl,
            "size": Utility.CountRange(self, new["size"], old["size"]) + " bytes",
            "diffUrl": source["old"]["gist"].html_url + "/revisions",
        },
    )

    # Ensure no changes go without notification
    if success is True:
        Utility.UpdateGist(self, source)
def _find_keyword_in_url(self, links, domain):
    # Get all urls and normalize them
    normalized_links = self._normalize_elems(links)
    # Filter out urls that don't contain a keyword
    candidate_links = \
        filter(lambda x: self._is_contain_keyword(
            x.replace(Utility.find_domain_name(domain), '')),
            normalized_links)
    return candidate_links
def __init__(self, messageFileName="default"):
    self.messageFileName = messageFileName
    self.messageLogger = csv.writer(open(
        self.messageFileName, 'wb'))
    self.messageLogger.writerow(["time", "code", ""])
    self.tweetLogger = csv.writer(open("tweets.csv", 'a'))
    self.util = Utility()
    self.time = self.util.currentTimeSeconds()
def filter_train_dataset(train_df):
    filtered_indices = []
    for question_id in tqdm(train_df.question_id.unique(),
                            total=train_df.question_id.nunique()):
        train_df_part = train_df[train_df.question_id == question_id]
        question = train_df_part.question_lem.values[0]
        sentences = train_df_part.sentence_lem.values
        filtered = Utility.filter_by_question_sentence_words_intersect(question, sentences)
        filtered_indices.extend(np.array(train_df_part.index)[np.array(filtered)])
    train_df_filtered = train_df[~train_df.index.isin(filtered_indices)]
    return train_df_filtered
def sort_email(self, emails, domain):
    # If this is not a governor's domain, drop any email candidate with a .go.id domain name
    if '.go.id' not in domain:
        emails = [email for email in emails if '.go.id' not in email]
    domain_name = Utility.find_domain_name(domain)
    emails = map(lambda email: (email, domain_name), emails)
    # Sort by score, descending
    emails.sort(key=self.email_scoring, reverse=True)
    emails = [x for x, y in emails]
    return emails[:cfg.max_email]
def save():
    db = None
    cursor = None
    try:
        print("User %s logged in!" % request.form['fname'])
        db = mysql.connector.connect(host=host, user=user,
                                     password=password, database=database)
        cursor = db.cursor(buffered=True)
        util = Util()
        status, uid = util.saveUser(db, cursor, request)
        if status and status == 200:
            return render_template("thankyou.html",
                                   fname=request.form['fname'], uid=uid)
        raise Exception("Unable to insert data!")
    except Exception as e:
        print(json.dumps({"error": str(e)}))
        return "<h1>Oops! Something went wrong.. Could you try after sometime or reach out to the host!</h1>"
    finally:
        # Guard against a failed connection before closing resources
        if cursor:
            cursor.close()
        if db:
            db.close()
def _on_link(self, page, domain):
    self.logger.info("Search email address on link to another page")
    _email_founds = []

    # Find all possible link elements
    links = page.findAll('a')
    # Find all candidate links with a keyword in the html text
    keyword_html_link = self._find_keyword_in_html_text(links)
    # Find all candidate links with a keyword in the url
    keyword_url_link = self._find_keyword_in_url(links, domain)
    # Merge the url results, remove duplicate urls
    candidate_links = Utility.uniquify(keyword_html_link + keyword_url_link)

    # Check for invalid urls and try to fix them
    invalid_url = [
        uri for uri in candidate_links if not cfg.url_regex.match(uri)
    ]
    try_fix_invalid_url = map(
        lambda _uri: Utility.normalize_invalid_url(_uri, domain),
        invalid_url)

    # Filter out invalid urls
    candidate_links = candidate_links + try_fix_invalid_url
    candidate_links = Utility.uniquify(
        [_uri for _uri in candidate_links if cfg.url_regex.match(_uri)])

    try:
        for link in candidate_links:
            self.logger.info("Go to next link: " + link)
            try:
                self._go_to_page(link)
            except Exception, err:
                print str(err)
                continue
            soup = BS(self.driver.page_source, "lxml")
            email = self._on_page(soup, domain)
            _email_founds.append(email)
        return _email_founds if not _email_founds else Utility.flatten_list(
            _email_founds)
    except Exception, e:
        logging.error(str(e))
        return _email_founds if not _email_founds else Utility.flatten_list(
            _email_founds)
def create_database(self, df,
                    database_origin_dir='../data/database_origin',
                    database_lem_dir='../data/database_lem'):
    if not os.path.exists(database_origin_dir):
        os.mkdir(database_origin_dir)
    if not os.path.exists(database_lem_dir):
        os.mkdir(database_lem_dir)
    for paragraph in tqdm(df.paragraph.unique(), total=df.paragraph.nunique()):
        paragraph_id = (df[df.paragraph == paragraph].paragraph_id.values[0])
        with open("{}/{}.txt".format(database_origin_dir, paragraph_id), 'w') as fout:
            fout.write(paragraph)
        txt_lemm = Utility.lemmatize(paragraph)
        with open("{}/{}.txt".format(database_lem_dir, paragraph_id), 'w') as fout:
            fout.write(txt_lemm)
def Initialize(self: Any) -> None:
    """Initialize SitRep and begin primary functionality."""

    logger.info("SitRep")
    logger.info("https://github.com/EthanC/SitRep")

    self.config: Dict[str, Any] = SitRep.LoadConfig(self)

    SitRep.SetupLogging(self)

    self.git: Github = Utility.GitLogin(self)

    for source in self.config["dataSources"]:
        SitRep.ProcessDataSource(self, source)

    logger.success("Finished processing data sources")
def main():
    '''Main function of the script'''
    paused = False
    logger = CliLogger()
    screen = Screen()
    resources = Resources()
    analytics = Analytics(logger)
    cooldown = Cooldown(COOLDOWNS)
    analytics.ignore = ANALYTICS_IGNORE
    resources.load(analytics)
    utility = Utility(logger, screen, resources, analytics, cooldown)
    logic = Logic(utility)
    try:
        handle = wait_league_window(logger, (0, 0, 1024, 768))
    except CantForgroundWindowError:
        pass
    logger.log('Press and hold x to exit bot.')
    screen.d3d.capture(target_fps=10, region=find_rect(handle))
    while True:
        try:
            if keyboard.is_pressed('x'):
                raise BotExitException
            if keyboard.is_pressed('ctrl+u'):
                paused = False
            if paused:
                time.sleep(0.1)
                continue
            if keyboard.is_pressed('ctrl+p'):
                paused = True
                logger.log(
                    'Bot paused. Press ctrl+u to unpause. Press x to exit.')
                continue
            logic.tick()
            time.sleep(random.randint(*TICK_INTERVAL) / 1000)
        except BotContinueException as exp:
            time.sleep(random.randint(*exp.tick_interval) / 1000)
        except NoCharacterInMinimap:
            time.sleep(1)
        except BotExitException:
            screen.d3d.stop()
            break
        except Exception:  # pylint:disable=broad-except
            traceback.print_exc()
            screen.d3d.stop()
            break
def _filter_email_candidates(candidates):
    # Remove duplicate elements
    candidates = Utility.uniquify(
        map(lambda email: str(email).strip().lower(),
            [] if not candidates else candidates))
    # Filter out emails that contain a blacklisted word
    candidates = filter(
        lambda email: not re.match(cfg.get_blacklist_regex(), email),
        candidates)
    # Filter out short emails
    candidates = [
        candidate for candidate in candidates if len(candidate) > 5
    ]
    # Filter out emails that contain newlines or whitespace
    candidates = [
        candidate for candidate in candidates
        if '\n' not in candidate and ' ' not in candidate
        and '\t' not in candidate
    ]
    return candidates
def create_train_dataset(errors, data_dir='../notebooks/bm25f',
                         database_dir="../data/database_origin"):
    # polyglot works with the original text to split it into sentences
    df_dict = {}
    for f in tqdm(os.listdir(data_dir)):
        question_id = int(f.split('.')[0])
        if f.endswith('.npy') and question_id not in errors:
            res = {}
            for doc_number, doc_id in enumerate(np.load('{}/{}'.format(data_dir, f))):
                with open("{}/{}.txt".format(database_dir, doc_id)) as fin:
                    res[doc_number] = Utility.sentence_splitter(fin.read())
            df_dict[question_id] = res

    df_with_list_of_docs = pd.DataFrame.from_records(df_dict).T
    df_with_list_of_sentences = pd.DataFrame()
    for col in df_with_list_of_docs.columns:
        df_per_doc = df_with_list_of_docs.apply(
            lambda x: pd.Series(x[col]),
            axis=1).stack().reset_index(level=1, drop=True).to_frame()
        df_per_doc['doc_number'] = col
        df_with_list_of_sentences = pd.DataFrame.append(
            df_with_list_of_sentences, df_per_doc)
    df_with_list_of_sentences = df_with_list_of_sentences.reset_index()
    df_with_list_of_sentences.columns = ['question_id', 'sentence', 'doc_number']
    return df_with_list_of_sentences
"avatar_url": self.config["discord"]["avatarUrl"], "embeds": [ { "title": embed.get("title"), "description": embed.get("description"), "url": embed.get("url"), "timestamp": datetime.utcnow().isoformat(), "color": int("66BB6A", base=16), "footer": { "text": embed.get("filename"), }, "image": {"url": embed.get("imageUrl")}, "author": { "name": "SitRep", "url": "https://github.com/EthanC/SitRep", "icon_url": "https://i.imgur.com/YDZgxh2.png", }, "fields": fields, } ], } return Utility.POST(self, self.config["discord"]["webhookUrl"], payload) if __name__ == "__main__": try: SitRep.Initialize(SitRep) except KeyboardInterrupt: exit()
def decrypt(seed, sensor_priv_num, org_data_len, n_nonzero_coefs, enc_data):
    enc_data /= Utility.generate_secret_value(seed, sensor_priv_num)
    enc_data_size = np.size(enc_data)
    sensing_matrix = Utility.generate_sensing_matrix(seed, enc_data_size,
                                                     org_data_len)
    omp = orthogonal_mp(sensing_matrix, enc_data,
                        n_nonzero_coefs=n_nonzero_coefs)
    return omp
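A minimal round-trip sketch for the encrypt/decrypt pair above, assuming Utility.generate_sensing_matrix and Utility.generate_secret_value behave as the snippets imply (the same seed reproduces the same matrix and secret value); the seed, private number, compression factor, and test signal are made up for illustration, and exact recovery is not guaranteed:

import numpy as np
from sklearn.linear_model import orthogonal_mp  # already used by decrypt()

# Illustrative parameters and a sparse test signal with 3 non-zero coefficients.
seed, priv_num, c_factor = 42, 7, 0.5
data = np.zeros(64)
data[[3, 17, 40]] = [1.5, -2.0, 0.7]

enc = encrypt(seed, priv_num, c_factor, data)        # compress, then mask with the secret value
rec = decrypt(seed, priv_num, np.size(data),
              n_nonzero_coefs=3, enc_data=enc)       # unmask, then recover via OMP

print(np.allclose(rec, data, atol=1e-6))             # recovery is approximate, not guaranteed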
base_dir = str(Path.home())

default_args = {
    'owner': 'user',
    'depends_on_past': False,
    'start_date': dt.datetime.strptime('2018-07-29T00:00:00',
                                       '%Y-%m-%dT%H:%M:%S'),
    'provide_context': True
}

# Instantiate the DAG, scheduled to run every day at midnight
dag = DAG('dag1',
          default_args=default_args,
          schedule_interval='0 0 * * *',
          max_active_runs=1)

util = Utility(news_api_key='', s3_bucket='')


# Get all sources in the English language
def sources(**kwargs):
    # sourcesCsvString = util.getSources('business', 'en', 'in')
    sourcesCsvString = util.getSources(language='en')
    return sourcesCsvString


# Get top headlines for the given list of sources
def headlines(**kwargs):
    ti = kwargs['ti']
    # xcom pull is used to get values from the sources task
    v1 = ti.xcom_pull(task_ids='gettingsources')
    csvFilesList = util.getheadlines(v1)
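The snippet stops before the callables are registered with the DAG. As a sketch only, the two functions would typically be wired up with PythonOperator like this; 'gettingsources' matches the task_ids used in the xcom_pull call above, while 'gettingheadlines' is a hypothetical id for the downstream task:

from airflow.operators.python_operator import PythonOperator

# provide_context=True is already set in default_args, so kwargs (including 'ti') are passed in.
get_sources = PythonOperator(task_id='gettingsources',
                             python_callable=sources,
                             dag=dag)

get_headlines = PythonOperator(task_id='gettingheadlines',  # hypothetical task id
                               python_callable=headlines,
                               dag=dag)

get_sources >> get_headlines  # headlines runs after sources and reads its XCom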
#!/usr/bin/python3
from pathlib import Path
import os
import sys

libPath = os.path.join(Path(__file__).absolute().parent.parent, 'pyuval/')
sys.path.append(libPath)

from config import Config
from utils import Utility

util = Utility()
config = Config()

memoryData = util.readDataFromMemoy(config.get('ksm:gpg:shm:address'))
print(memoryData)
parser.add_argument("-l", "--list", help="list all yubikeys in database", action="store_true", default=False, dest="list") parser.add_argument("-s", "--subsystem", help="Subsystem to apply action [client, yubikey]", default=None, type=str, dest="subsystem") args = parser.parse_args() util = Utility() config = Config() log = PyuvalLogging(ModuleName="ManagePyuval") if 'SUDO_USER' in os.environ: user = os.environ['SUDO_USER'] else: user = os.environ['USER'] ksmDb = Database(**config.get('ksm:db')) if args.subsystem == "yubikey": if args.add: serial = input('Serial: ').strip() username = input('Username: '******'Public ID: ').strip()
def get_base_stats(question, sentences, question_lem, sentences_lem, idfs, idfs_lem):
    (unique_word_count_scores, unique_word_percent_scores, sentence_len,
     bm25f_scores, tf_idf_scores) = Utility.stats(question, sentences, idfs)
    (unique_lem_word_count_scores, unique_lem_word_percent_scores, sentence_lem_len,
     bm25f_lem_scores, tf_idf_lem_scores) = Utility.stats(question_lem, sentences_lem, idfs_lem)

    s = pd.Series([
        unique_word_count_scores,
        unique_lem_word_count_scores,
        unique_word_percent_scores,
        unique_lem_word_percent_scores,
        sentence_len,
        sentence_lem_len,
        bm25f_scores,
        bm25f_lem_scores,
        tf_idf_scores,
        tf_idf_lem_scores,
        sentences,
        sentences_lem,
    ])
    return pd.DataFrame.from_items(zip(s.index, s.values))
def training(args):
    source = EventField(fix_length=args.event_size, embed_size=args.src_embed)
    mask_flag = 'tmpl' in args.net
    sentence_size = args.sentence_size if args.truncate else None
    reverse_decode = args.reverse_decode

    if 'disc' in args.net:
        target = TextAndContentWordField(start_token=None,
                                         fix_length=sentence_size,
                                         mask_player=mask_flag,
                                         mask_team=mask_flag,
                                         numbering=args.numbering,
                                         reverse=reverse_decode,
                                         bpc=args.bpc,
                                         multi_tag=args.multi_tag)
    else:
        target = TextField(start_token=None,
                           fix_length=sentence_size,
                           mask_player=mask_flag,
                           mask_team=mask_flag,
                           numbering=args.numbering,
                           reverse=reverse_decode,
                           bpc=args.bpc,
                           multi_tag=args.multi_tag)

    if args.truncate:
        train = OptaDataset(path=args.dataset + '.train',
                            fields={'source': source, 'target': target})
    else:
        train = OptaDataset(path=args.dataset + '.train',
                            fields={'source': source, 'target': target},
                            limit_length=args.limit)
    source.build_vocabulary(train.source)
    target.build_vocabulary(train.target, size=args.vocab_size)
    target.player_to_id = source.player_to_id
    target.players = source.id_to_player

    if mask_flag or 'disc' in args.net:
        content_word_to_id = getattr(target, 'content_word_to_id', None)
        target_test = TestTextField(source.id_to_player, source.id_to_team,
                                    target.word_to_id, content_word_to_id,
                                    target.unk_id, fix_length=None,
                                    bpc=args.bpc)
    else:
        target_test = TextField(start_token=None, end_token=None,
                                fix_length=None, bpc=args.bpc)
        target_test.word_to_id = target.word_to_id
        target_test.id_to_word = target.id_to_word
        target_test.unk_id = target.unk_id

    dev = OptaDataset(path=args.dataset + '.dev',
                      fields={'source': source, 'target': target_test},
                      limit_length=args.limit)
    train2 = OptaDataset(path=args.dataset + '.train',
                         fields={'source': source, 'target': target_test},
                         limit_length=args.limit)
    test = OptaDataset(path=args.dataset + '.test',
                       fields={'source': source, 'target': target_test})
    test20 = OptaDataset(path=args.dataset + '.test',
                         fields={'source': source, 'target': target_test},
                         limit_length=20)
    test15 = OptaDataset(path=args.dataset + '.test',
                         fields={'source': source, 'target': target_test},
                         limit_length=15)
    test10 = OptaDataset(path=args.dataset + '.test',
                         fields={'source': source, 'target': target_test},
                         limit_length=10)

    start_id, end_id = target.word_to_id['<s>'], target.word_to_id['</s>']
    class_weight = compute_class_weight('./dataset/player_list.txt',
                                        target.word_to_id,
                                        args.class_weight[0],
                                        args.class_weight[1],
                                        gpu=args.gpu)
    dirname = Utility.get_save_directory(
        args.net, './debug' if args.debug else args.output)
    if args.debug:
        save_path = os.path.join('./debug', dirname)
    else:
        save_path = os.path.join(args.output, dirname)
    Utility.make_directory(save_path)

    del args.vocab_size
    setting = {
        'vocab_size': len(target.word_to_id),
        'type_size': len(source.type_to_id),
        'player_size': len(source.player_to_id),
        'team_size': len(source.team_to_id),
        'detail_size': len(source.detail_to_id),
        'detail_dim': source.details_dimention,
        'start_id': start_id,
        'end_id': end_id,
        'unk_id': target.unk_id,
        'save_path': save_path,
        **vars(args)
    }
    dump_setting(setting, os.path.join(save_path, 'setting.yaml'))

    home_player_tag = target.word_to_id.get(target.home_player_tag)
    away_player_tag = target.word_to_id.get(target.away_player_tag)
    home_team_tag = target.word_to_id.get(target.home_team_tag)
    away_team_tag = target.word_to_id.get(target.away_team_tag)
    print('vocab size: {}'.format(len(target.word_to_id)))

    if args.net == 'plain':
        model = MLPEncoder2AttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), args.trg_embed, args.hidden,
            start_id, end_id, class_weight, args.mlp_layers,
            args.max_length, args.dropout, IGNORE_LABEL,
            reverse_decode=reverse_decode)
    elif args.net == 'tmpl':
        model = MLPEncoder2AttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), args.trg_embed, args.hidden,
            start_id, end_id, class_weight, args.mlp_layers,
            args.max_length, args.dropout, IGNORE_LABEL,
            source.id_to_player, home_player_tag, away_player_tag,
            source.id_to_team, home_team_tag, away_team_tag,
            target.player_to_id, target.players,
            reverse_decode=reverse_decode)
    elif args.net == 'gate':
        model = MLPEncoder2GatedAttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), args.trg_embed, args.hidden,
            start_id, end_id, class_weight, args.mlp_layers,
            args.max_length, args.dropout, IGNORE_LABEL,
            reverse_decode=reverse_decode)
    elif args.net == 'gate-tmpl':
        model = MLPEncoder2GatedAttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), args.trg_embed, args.hidden,
            start_id, end_id, class_weight, args.mlp_layers,
            args.max_length, args.dropout, IGNORE_LABEL,
            source.id_to_player, home_player_tag, away_player_tag,
            source.id_to_team, home_team_tag, away_team_tag,
            target.player_to_id, target.players,
            reverse_decode=reverse_decode)
    elif args.net == 'disc':
        model = DiscriminativeMLPEncoder2AttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), len(target.content_word_to_id),
            args.trg_embed, args.hidden, start_id, end_id, class_weight,
            args.loss_weight, args.disc_loss, args.loss_func,
            args.mlp_layers, args.max_length, args.dropout, IGNORE_LABEL,
            reverse_decode=reverse_decode)
    elif args.net == 'disc-tmpl':
        model = DiscriminativeMLPEncoder2AttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), len(target.content_word_to_id),
            args.trg_embed, args.hidden, start_id, end_id, class_weight,
            args.loss_weight, args.disc_loss, args.loss_func,
            args.mlp_layers, args.max_length, args.dropout, IGNORE_LABEL,
            source.id_to_player, home_player_tag, away_player_tag,
            source.id_to_team, home_team_tag, away_team_tag,
            target.player_to_id, target.players,
            reverse_decode=reverse_decode)
    elif args.net == 'gate-disc':
        model = DiscriminativeMLPEncoder2GatedAttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), len(target.content_word_to_id),
            args.trg_embed, args.hidden, start_id, end_id, class_weight,
            args.loss_weight, args.disc_loss, args.loss_func,
            args.mlp_layers, args.max_length, args.dropout, IGNORE_LABEL,
            reverse_decode=reverse_decode)
    elif args.net == 'gate-disc-tmpl':
        model = DiscriminativeMLPEncoder2GatedAttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), len(target.content_word_to_id),
            args.trg_embed, args.hidden, start_id, end_id, class_weight,
            args.loss_weight, args.disc_loss, args.loss_func,
            args.mlp_layers, args.max_length, args.dropout, IGNORE_LABEL,
            source.id_to_player, home_player_tag, away_player_tag,
            source.id_to_team, home_team_tag, away_team_tag,
            target.player_to_id, target.players,
            reverse_decode=reverse_decode)
    elif args.net == 'conv-gate-disc-tmpl':
        model = DiscriminativeGLUEncoder2GatedAttentionDecoder(
            len(source.type_to_id), len(source.player_to_id),
            len(source.team_to_id), len(source.detail_to_id),
            source.details_dimention, args.src_embed, args.event_size,
            len(target.word_to_id), len(target.content_word_to_id),
            args.trg_embed, args.hidden, start_id, end_id, class_weight,
            args.loss_weight, args.disc_loss, args.loss_func,
            args.mlp_layers, args.max_length, args.dropout, IGNORE_LABEL,
            source.id_to_player, home_player_tag, away_player_tag,
            source.id_to_team, home_team_tag, away_team_tag,
            target.player_to_id, target.players,
            reverse_decode=reverse_decode)

    model.keyword_ids = [
        target.word_to_id['save'], target.word_to_id['block'],
        target.word_to_id['chance'], target.word_to_id['shot'],
        target.word_to_id['clearance'], target.word_to_id['kick'],
        target.word_to_id['ball'], target.word_to_id['blocked'],
        target.word_to_id['denied']
    ]
    model.id_to_word = target.id_to_word
    if args.numbering:
        model.player_id = target.player_id
        model.team_id = target.team_id

    if args.gpu is not None:
        model.use_gpu(args.gpu)
    opt = optimizers.Adam(args.lr)
    opt.setup(model)
    if args.clipping > 0:
        opt.add_hook(GradientClipping(args.clipping))
    if args.decay > 0:
        opt.add_hook(WeightDecay(args.decay))

    N = len(train.source)
    batch_size = args.batch
    order_provider = OrderProvider(Sampling.get_random_order(N))
    src_train_iter = SequentialIterator(train.source, batch_size,
                                        order_provider, args.event_size,
                                        source.fillvalue, gpu=args.gpu)
    if 'disc' in args.net:
        trg_train_iter = TextAndLabelIterator(train.target, batch_size,
                                              order_provider,
                                              args.sentence_size,
                                              IGNORE_LABEL, gpu=args.gpu)
    else:
        trg_train_iter = SequentialIterator(train.target, batch_size,
                                            order_provider,
                                            args.sentence_size,
                                            IGNORE_LABEL, gpu=args.gpu)
    src_dev_iter = SequentialIterator(dev.source, batch_size, None,
                                      args.event_size, source.fillvalue,
                                      gpu=args.gpu)
    trg_dev_iter = Iterator(dev.target, batch_size,
                            wrapper=EndTokenIdRemoval(end_id), gpu=None)
    src_test_iter = SequentialIterator(test.source, batch_size, None,
                                       args.event_size, source.fillvalue,
                                       gpu=args.gpu)
    src_test20_iter = SequentialIterator(test20.source, batch_size, None,
                                         args.event_size, source.fillvalue,
                                         gpu=args.gpu)
    src_test15_iter = SequentialIterator(test15.source, batch_size, None,
                                         args.event_size, source.fillvalue,
                                         gpu=args.gpu)
    src_test10_iter = SequentialIterator(test10.source, batch_size, None,
                                         args.event_size, source.fillvalue,
                                         gpu=args.gpu)
    src_train2_iter = SequentialIterator(train2.source, batch_size, None,
                                         args.event_size, source.fillvalue,
                                         gpu=args.gpu)
    trg_train2_iter = Iterator(train2.target, batch_size,
                               wrapper=EndTokenIdRemoval(end_id), gpu=None)
    trg_test_iter = Iterator(test.target, batch_size,
                             wrapper=EndTokenIdRemoval(end_id), gpu=None)
    trg_test20_iter = Iterator(test20.target, batch_size,
                               wrapper=EndTokenIdRemoval(end_id), gpu=None)
    trg_test15_iter = Iterator(test15.target, batch_size,
                               wrapper=EndTokenIdRemoval(end_id), gpu=None)
    trg_test10_iter = Iterator(test10.target, batch_size,
                               wrapper=EndTokenIdRemoval(end_id), gpu=None)

    if 'disc' in args.net:
        trainer = Seq2SeqWithLabelTrainer(
            model, opt, src_train_iter, trg_train_iter, src_dev_iter,
            trg_dev_iter, order_provider, evaluate_bleu_and_accuracy,
            args.epoch, save_path, args.eval_step, src_train2_iter,
            trg_train2_iter)
    else:
        trainer = Seq2SeqTrainer(model, opt, src_train_iter, trg_train_iter,
                                 src_dev_iter, trg_dev_iter, order_provider,
                                 evaluate_bleu, args.epoch, save_path,
                                 args.eval_step, src_train2_iter,
                                 trg_train2_iter)

    trainer.run()

    # load best model
    model.load_model(os.path.join(save_path, 'best.model'))
    if 'disc' in args.net:
        bleu_score_dev, _, _ = evaluate_bleu_and_accuracy(
            model, src_dev_iter, trg_dev_iter)
        bleu_score, _, _ = evaluate_bleu_and_accuracy(
            model, src_test_iter, trg_test_iter)
        bleu_score20, _, hypotheses = evaluate_bleu_and_accuracy(
            model, src_test20_iter, trg_test20_iter)
        bleu_score15, _, _ = evaluate_bleu_and_accuracy(
            model, src_test15_iter, trg_test15_iter)
        bleu_score10, _, _ = evaluate_bleu_and_accuracy(
            model, src_test10_iter, trg_test10_iter)
    else:
        bleu_score_dev, _ = evaluate_bleu(model, src_dev_iter, trg_dev_iter)
        bleu_score, _ = evaluate_bleu(model, src_test_iter, trg_test_iter)
        bleu_score20, hypotheses = evaluate_bleu(model, src_test20_iter,
                                                 trg_test20_iter)
        bleu_score15, _ = evaluate_bleu(model, src_test15_iter,
                                        trg_test15_iter)
        bleu_score10, _ = evaluate_bleu(model, src_test10_iter,
                                        trg_test10_iter)
    TextFile(os.path.join(save_path, 'hypotheses.txt'),
             [' '.join(ys) for ys in trainer.hypotheses]).save()
    print('dev score: {}'.format(bleu_score_dev))
    print('test score: {}'.format(bleu_score))
    print('test score20: {}'.format(bleu_score20))
    print('test score15: {}'.format(bleu_score15))
    print('test score10: {}'.format(bleu_score10))

    # saving fields
    pickle_dump(os.path.join(save_path, 'source.pkl'), source)
    pickle_dump(os.path.join(save_path, 'target.pkl'), target)
    pickle_dump(os.path.join(save_path, 'target_test.pkl'), target_test)
def predict_on_test(self):
    """
    This function will load the test dataset, pre-process the test images
    and check the performance of the trained models on unseen data. It will
    also save the confusion matrix and classification report as csv files
    in separate dataframes for each model and for each stage, in the
    evaluation directory.

    Arguments:

        -size_dict : Contains information about the input image sizes for
                     each of the models
        -model_name : Name of the model, for example - vgg16, inception_v3,
                      resnet50 etc.
        -stage_no : The training stage of the model. You will have a choice
                    to select the number of training stages. In stage 1, we
                    only fine-tune the top 2 dense layers by freezing the
                    convolution base. In stage 2, we re-adjust the weights
                    trained in stage 1 by training the top convolution
                    layers, while freezing the dense layers.
    """
    print("\nStarting model evaluation for stage {}..".format(self.stage_no))

    # Create a utility class object to access the class methods
    utils_obj = Utility(self.input_params, self.path_dict)

    df_test = utils_obj.load_data("test")

    test_datagen = ImageDataGenerator(
        preprocessing_function=utils_obj.init_preprocess_func())

    test_generator = test_datagen.flow_from_dataframe(
        dataframe=df_test,
        directory=self.path_dict['source'],
        target_size=utils_obj.init_sizes(),
        x_col="filenames",
        y_col="class_label",
        batch_size=1,
        class_mode='categorical',
        color_mode='rgb',
        shuffle=False)

    nb_test_samples = len(test_generator.classes)
    model = utils_obj.get_models(self.stage_no)
    class_indices = test_generator.class_indices

    def label_class(cat_name):
        return (class_indices[cat_name])

    df_test['true'] = df_test['class_label'].apply(
        lambda x: label_class(str(x)))
    y_true = df_test['true'].values

    # Predictions (probability scores and class labels)
    y_pred_proba = model.predict_generator(test_generator,
                                           nb_test_samples // 1)
    y_pred = np.argmax(y_pred_proba, axis=1)
    df_test['predicted'] = y_pred
    df_test.to_csv(self.path_dict["eval_path"] +
                   "stage{}/".format(self.stage_no) +
                   '{}_predictions_stage_{}.csv'.format(
                       self.input_params['model_name'], self.stage_no))

    dictionary = dict(zip(df_test.true.values, df_test.class_label.values))

    # Confusion matrix
    cm = metrics.confusion_matrix(y_true, y_pred)
    df_cm = pd.DataFrame(cm).transpose()
    df_cm = df_cm.rename(index=dictionary, columns=dictionary,
                         copy=True, inplace=False)
    df_cm.to_csv(self.path_dict["eval_path"] +
                 "stage{}/".format(self.stage_no) +
                 '{}_cm_stage_{}.csv'.format(
                     self.input_params['model_name'], self.stage_no))
    print('Confusion matrix prepared and saved..')

    # Classification report
    report = metrics.classification_report(
        y_true, y_pred,
        target_names=list(class_indices.keys()),
        output_dict=True)
    df_rep = pd.DataFrame(report).transpose()
    df_rep.to_csv(self.path_dict["eval_path"] +
                  "stage{}/".format(self.stage_no) +
                  '{}_class_report_stage_{}.csv'.format(
                      self.input_params['model_name'], self.stage_no))
    print('Classification report prepared and saved..')

    EvalUtils.plot_confusion_matrix(
        self, y_true, y_pred, list(test_generator.class_indices.keys()))

    # General metrics
    df_metrics = EvalUtils.get_metrics(self, y_true, y_pred)
    df_metrics.to_csv(self.path_dict["eval_path"] +
                      "stage{}/".format(self.stage_no) +
                      '{}_metrics_stage_{}.csv'.format(
                          self.input_params['model_name'], self.stage_no))

    history_df = pd.read_csv(
        self.path_dict["model_path"] + "stage{}/".format(self.stage_no) +
        "{}_history_stage_{}.csv".format(self.input_params['model_name'],
                                         self.stage_no))

    # Plot the train vs validation loss for all epochs
    EvalUtils.plt_epoch_error(self, history_df)

    # Generate a complete report and save it as an HTML file in the
    # evaluation folder location
    EvalUtils.get_complete_report(self, y_true, y_pred, class_indices)
This file primarily performs
(a) indexing the input file
(b) persisting the indices locally
(c) using indices to return search results for a given query
"""
import time
import sys
from optparse import OptionParser

import utils.Utility as utility
from core.Indexing import BuiltFileIndex
from core.Searching import SearchIndex

if __name__ == "__main__":
    startTime = time.time()
    print("Starting: {}".format(utility.get_date_time(startTime)))
    try:
        parser = OptionParser()
        parser.add_option("-f", "--file", dest="file")

        # parse the input
        (options, args) = parser.parse_args()

        # get the file to be indexed
        input_file = options.file

        # if not present, throw an exception
        if input_file is None:
            raise Exception("Missing Input File!")
def ProcessDataSource(self: Any, source: Dict[str, Any]) -> None:
    """Prepare to diff the provided data source."""

    source["hash"] = Utility.MD5(self, source["url"])
    source["older"] = {}
    source["old"] = {}
    source["new"] = {}

    older: Dict[str, Any] = source["older"]
    old: Dict[str, Any] = source["old"]
    new: Dict[str, Any] = source["new"]

    format: str = source["contentType"].upper()
    allowRevert: bool = source.get("allowRevert", True)

    if format == "JSON":
        source["ext"] = "json"
        source["filename"] = source["hash"] + "." + source["ext"]

        old["gist"] = Utility.GetGist(self, source["filename"])
        new["raw"] = Utility.FormatJSON(self, Utility.GET(self, source["url"]))

        if old["gist"] is False:
            return
        elif (new["raw"] is not None) and (old["gist"] is not None):
            if allowRevert is False:
                older["raw"] = Utility.FormatJSON(
                    self,
                    Utility.GetGistRaw(self, old["gist"], source["filename"], 1),
                )

            old["raw"] = Utility.FormatJSON(
                self, Utility.GetGistRaw(self, old["gist"], source["filename"])
            )

            SitRep.DiffJSON(self, source)
        elif (new["raw"] is not None) and (old["gist"] is None):
            Utility.CreateGist(self, source)
    elif format == "IMAGE":
        source["ext"] = "txt"
        source["filename"] = source["hash"] + "." + source["ext"]

        old["gist"] = Utility.GetGist(self, source["filename"])
        new["raw"] = Utility.Base64(
            self, Utility.GET(self, source["url"], raw=True)
        )

        if old["gist"] is False:
            return
        elif (new["raw"] is not None) and (old["gist"] is not None):
            if allowRevert is False:
                older["raw"] = Utility.GetGistRaw(
                    self, old["gist"], source["filename"], 1
                )

            old["raw"] = Utility.GetGistRaw(self, old["gist"], source["filename"])

            SitRep.DiffImage(self, source)
        elif (new["raw"] is not None) and (old["gist"] is None):
            Utility.CreateGist(self, source)
    elif format == "TEXT":
        source["ext"] = source.get("fileType", "txt")
        source["filename"] = source["hash"] + "." + source["ext"]

        old["gist"] = Utility.GetGist(self, source["filename"])
        new["raw"] = Utility.GET(self, source["url"])

        if old["gist"] is False:
            return
        elif (new["raw"] is not None) and (old["gist"] is not None):
            if allowRevert is False:
                older["raw"] = Utility.GetGistRaw(
                    self, old["gist"], source["filename"], 1
                )

            old["raw"] = Utility.GetGistRaw(self, old["gist"], source["filename"])

            SitRep.DiffText(self, source)
        elif (new["raw"] is not None) and (old["gist"] is None):
            Utility.CreateGist(self, source)
    else:
        logger.error(f"Data source with content type {format} is not supported")
        logger.debug(source)
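ProcessDataSource reads the url, contentType, allowRevert, and fileType keys from each entry in config["dataSources"] (iterated in Initialize above). A hypothetical dataSources value, sketched as Python literals since the actual config file layout is not shown in these snippets; the URLs are placeholders:

# Hypothetical dataSources entries; only the keys read by ProcessDataSource are shown.
dataSources = [
    {"url": "https://example.com/status.json", "contentType": "json"},
    {"url": "https://example.com/banner.png", "contentType": "image", "allowRevert": False},
    {"url": "https://example.com/notes.md", "contentType": "text", "fileType": "md"},
]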
    try:
        for link in candidate_links:
            self.logger.info("Go to next link: " + link)
            try:
                self._go_to_page(link)
            except Exception, err:
                print str(err)
                continue
            soup = BS(self.driver.page_source, "lxml")
            email = self._on_page(soup, domain)
            _email_founds.append(email)
        return _email_founds if not _email_founds else Utility.flatten_list(
            _email_founds)
    except Exception, e:
        logging.error(str(e))
        return _email_founds if not _email_founds else Utility.flatten_list(
            _email_founds)

def sort_email(self, emails, domain):
    # If this is not a governor's domain, drop any email candidate with a .go.id domain name
    if '.go.id' not in domain:
        emails = [email for email in emails if '.go.id' not in email]
    domain_name = Utility.find_domain_name(domain)
    emails = map(lambda email: (email, domain_name), emails)
    # Sort by score, descending
    emails.sort(key=self.email_scoring, reverse=True)
    emails = [x for x, y in emails]
    return emails[:cfg.max_email]

@staticmethod
def email_scoring(email_payload):
def DiffText(self: Any, source: Dict[str, Any]) -> None:
    """Diff the provided text data source."""

    filename: str = source["filename"]
    url: str = source["url"]
    allowRevert: bool = source.get("allowRevert", True)

    older: Dict[str, Any] = source["older"]
    old: Dict[str, Any] = source["old"]
    new: Dict[str, Any] = source["new"]

    if allowRevert is False:
        older["hash"] = Utility.MD5(self, older["raw"])

    old["hash"] = Utility.MD5(self, old["raw"])
    new["hash"] = Utility.MD5(self, new["raw"])

    if old["hash"] == new["hash"]:
        logger.info(f"No difference found in {filename} ({url})")
        return
    elif (allowRevert is False) and (older["hash"] == new["hash"]):
        logger.info(f"Ignored revert found in {filename} ({url})")
        return

    diff: Iterator[str] = Differ().compare(
        old["raw"].splitlines(), new["raw"].splitlines()
    )

    desc: str = ""
    additions: int = 0
    deletions: int = 0

    for line in diff:
        if line.startswith("+ "):
            additions += 1
            desc += f"{line}\n"
        elif line.startswith("- "):
            deletions += 1
            desc += f"{line}\n"

    desc = Utility.Truncate(self, desc, 4048, split="\n")

    source["urlTrim"] = Utility.Truncate(self, url, 256)

    success: bool = SitRep.Notify(
        self,
        {
            "title": source["urlTrim"],
            "description": f"```diff\n{desc}```",
            "url": url,
            "filename": source["filename"],
            "additions": f"{additions:,}",
            "deletions": f"{deletions:,}",
            "diffUrl": source["old"]["gist"].html_url + "/revisions",
        },
    )

    # Ensure no changes go without notification
    if success is True:
        Utility.UpdateGist(self, source)
def _is_contain_domain(self, domain, email):
    email_domain = Utility.find_email_domain(email)
    return email_domain in domain if email_domain else False
def train_stage1(self):
    """
    In this stage, we will freeze all the convolution blocks and train only
    the newly added dense layers. We will add a global spatial average
    pooling layer and fully connected dense layers on top of the base
    model's output. We will freeze the convolution base and train only the
    top layers: all the convolution layers are set to non-trainable, and
    the model should be compiled after they have been frozen.

    Arguments:

        -input_params : This parameter will contain all the information
                        that the user will input through the terminal
    """
    print("\nTraining the model by freezing the convolution block and tuning the top layers...")
    st = dt.now()

    utils_obj = Utility(self.input_params, self.path_dict)

    # If model_name != 'custom', load a pre-trained ImageNet base;
    # otherwise use the user-supplied custom model.
    if (self.input_params['model_name'] != 'custom'):
        base_model = utils_obj.load_imagenet_model()

        # Adding a global spatial average pooling layer
        x = base_model.output
        x = GlobalAveragePooling2D()(x)

        # Adding a fully-connected dense layer
        # x = Dense(self.input_params['dense_neurons'], activation='relu', kernel_initializer='he_normal')(x)

        # Adding the custom layers
        customlayers = self.input_params['customlayers']
        x = customlayers(x)

        # Adding the final dense output layer
        n = utils_obj.no_of_classes()
        output_layer = Dense(
            n,
            activation=self.input_params['outputlayer_activation'],
            kernel_initializer='glorot_uniform')(x)

        # Define the model
        model_stg1 = Model(inputs=base_model.input, outputs=output_layer)

        # Here we freeze the convolution base and train only the top layers.
        # All convolution layers are set to non-trainable; the model should
        # be compiled after this is done.
        for layer in base_model.layers:
            layer.trainable = False
    else:
        model_stg1 = self.input_params['custom_model']

    # Compiling the model
    model_stg1.compile(
        optimizer=optimizers.Adam(lr=self.input_params['stage1_lr']),
        loss='categorical_crossentropy',
        metrics=[self.input_params['metric']])

    # Normalize the images
    train_datagen = ImageDataGenerator(
        preprocessing_function=utils_obj.init_preprocess_func())
    val_datagen = ImageDataGenerator(
        preprocessing_function=utils_obj.init_preprocess_func())

    df_train = utils_obj.load_data("train")
    df_val = utils_obj.load_data("val")

    train_generator = train_datagen.flow_from_dataframe(
        dataframe=df_train,
        directory=self.path_dict['source'],
        target_size=utils_obj.init_sizes(),
        x_col="filenames",
        y_col="class_label",
        batch_size=self.input_params['batch_size'],
        class_mode='categorical',
        color_mode='rgb',
        shuffle=True)

    val_generator = val_datagen.flow_from_dataframe(
        dataframe=df_val,
        directory=self.path_dict['source'],
        target_size=utils_obj.init_sizes(),
        x_col="filenames",
        y_col="class_label",
        batch_size=self.input_params['batch_size'],
        class_mode='categorical',
        color_mode='rgb',
        shuffle=True)

    nb_train_samples = len(train_generator.classes)
    nb_val_samples = len(val_generator.classes)

    history = model_stg1.fit_generator(
        generator=train_generator,
        steps_per_epoch=nb_train_samples // self.input_params['batch_size'],
        epochs=self.input_params['epochs1'],
        validation_data=val_generator,
        validation_steps=nb_val_samples // self.input_params['batch_size'],
        callbacks=TrainingUtils.callbacks_list(self, 1),  # 1 for stage 1
        workers=self.input_params['nworkers'],
        use_multiprocessing=False,
        max_queue_size=20)

    hist_df = pd.DataFrame(history.history)
    hist_csv_file = (self.path_dict['model_path'] + "stage{}/".format(1) +
                     "{}_history_stage_{}.csv".format(
                         self.input_params['model_name'], 1))
    with open(hist_csv_file, mode='w') as file:
        hist_df.to_csv(file, index=None)

    # model_stg1.load_weights(self.path_dict['model_path'] + "stage{}/".format(1) + "{}_weights_stage_{}.hdf5".format(self.input_params['model_name'], 1))
    model_stg1.save(
        self.path_dict['model_path'] + "stage{}/".format(1) +
        "{}_model_stage_{}.h5".format(self.input_params['model_name'], 1))

    TrainingUtils.save_summary(self, model_stg1, 1)
    TrainingUtils.plot_layer_arch(self, model_stg1, 1)

    stage1_params = dict()
    stage1_params['train_generator'] = train_generator
    stage1_params['val_generator'] = val_generator
    stage1_params['nb_train_samples'] = nb_train_samples
    stage1_params['nb_val_samples'] = nb_val_samples

    print("\nTime taken to train the model in stage 1: ", dt.now() - st)

    # Start model evaluation for stage 1
    eval_utils = EvalUtils(self.input_params, self.path_dict, 1)
    eval_utils.predict_on_test()

    return model_stg1, stage1_params
    indent thrice
    * Bullet point 1
    * Bullet point 2
    * Bullet point 3
    1. item 1
    2. item 2
    3. item 3
    1.item 3-1
"""
doc.set_section_text(text=text_1)

doc.create_section(2, 'Section 1.1')
text_2 = """
This text is centred
test 12345
'''test 6789'''
"""
doc.set_section_text(2, text_2, True)

doc.create_section(1, 'Section 2')
doc.delete_section(3)

# WRITE FILES
util = Utility()
doc_txt = doc.__str__()
doc_xml = util.convert_to_xml(doc.get_wiki())
util.write_file(contents=doc_txt)
util.write_file(contents=doc_xml, file_type='xml', xsd_schema='wiki')

print('SCRIPT COMPLETED')