def executeRequest(self, requestData):
    """Executes the POST request and retrieves the corresponding response content.
    Request headers are generated here.
    :return: the response content
    """
    headers = {"Host": "www.facebook.com",
               "Origin": "http://www.facebook.com",
               "Referer": "https://www.facebook.com",
               "accept-encoding": "gzip,deflate",
               "accept-language": "en-US,en;q=0.8",
               "cookie": self._cookie,
               "pragma": "no-cache",
               "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.122 Safari/537.36",
               "content-type": "application/x-www-form-urlencoded",
               "accept": "*/*",
               "cache-control": "no-cache"}

    url = "https://www.facebook.com/ajax/mercury/thread_info.php"

    start = time.time()
    response = requests.post(url, data=requestData, headers=headers)
    end = time.time()
    logger.info("Retrieved in {0:.2f}s".format(end - start))

    # Remove additional leading characters
    msgsData = response.text[9:]
    return msgsData
def download(youtube_id):
    temp_dir = tempfile.mkdtemp()

    # Fake up a YouTube URL since youtube-dl expects one
    youtube_url = "http://www.youtube.com/watch?v={0}".format(youtube_id)

    video_filename_template = youtube_id + ".%(ext)s"
    video_path_template = os.path.join(temp_dir, video_filename_template)

    # Download a copy of the video from YouTube, but limit the resolution to
    # no more than "720p". For details on the '--format' option, see
    # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
    command_args = ["python", "youtube-dl/youtube-dl.py",
                    "--format", "best[height<=720]",
                    "-icw",
                    "-o", video_path_template,
                    youtube_url]

    results = popen_results(command_args)
    logger.info(results)

    files = os.listdir(temp_dir)
    if not files:
        return
    assert len(files) == 1

    video_path = os.path.join(temp_dir, files[0])
    logger.info(video_path)

    return video_path
def parseConversation(convPath, out, authors):
    """
    Parses all "relevant" messages and their attributes, then saves them to a text file.

    Current message format example:
    2012.06.17 15:27:42 SENDER_1 Message text from sender1

    authors should be a dict mapping the IDs present in the msgData to the
    preferred aliases, if any. If a key is not present, the ID itself is used
    as the alias for all subsequent messages.
    """
    with open(convPath, encoding='utf-8') as data_file:
        actions = json.load(data_file)

    f = open(out, "w", encoding='utf-8')

    # TODO consider different types of messages, such as calls or stickers.
    # Stickers leave an empty message, given that there is no textual content.
    # log:phone-call, log:video-call
    messages = []
    for action in actions:
        if "log_message_type" in action:
            logger.info("Skipping message of type: " + action["log_message_type"])
            continue
        msg = parseMessage(action, authors)
        if msg:
            messages.append(msg)

    # FIXME it happened that the number of lines exceeded the previously
    # reported number of messages retrieved
    for msg in messages:
        f.write(msg + "\n")
def handle_nick(data, match, client, channels):
    """
    When a user changes their nick, tell everyone in all the channels
    they're in about it.

    '^NICK (?P<nick>.*)'

    :type data: str
    :type match: dict
    :type client: Client
    :type channels: list
    """
    newnick = match['nick']
    logger.info("Set new user's nick to '{newnick}'", newnick=newnick)

    client.send(Protocol.Nick.response(client.nick, newnick))

    announce = Protocol.Nick.announce(client, newnick)
    for cl in set(chain.from_iterable(chan.clients for chan in client.channels)):
        if cl is not client:
            cl.send(announce)

    client.nick = newnick
def install_it(pkg, system='debian'):
    """
    Install a package only if it isn't already installed.

    Returns True if the package is installed (or was already present),
    False if the installation failed.

    Possible values of system are:
      * debian
      * archlinux
    """
    if system == 'debian':
        cmd = 'sudo apt-get --assume-yes install ' + pkg
    elif system == 'archlinux':
        cmd = 'sudo pacman -S ' + pkg

    info('Installing the package ' + pkg + ' ...')

    if is_pack_installed(pkg):
        return True

    try:
        (st, out) = subprocess.getstatusoutput(cmd)
    except OSError:
        error('Maybe apt is not installed in this system.')
        st = 32512  # The error code of 'command not found'
    return st == 0
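# A minimal usage sketch for install_it (assumes the info(), error() and
# is_pack_installed() helpers used above are defined in the same module):
# if install_it('htop', system='debian'):
#     info('htop is available')
# else:
#     error('could not install htop')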
def response_xml(xml):
    xml = [outer.split('</')
           for outer in [inner.split('>')
                         for inner in xml.split('\n')
                         if 'aspsms' in inner][-1]
           if '</' in outer]
    result = dict()
    for val, key in xml:
        if val:
            result[key] = val
            logger.info('aspsms %s: %s' % (key, val))
    return result
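# A minimal usage sketch for response_xml (hypothetical aspsms response
# payload; assumes a module-level `logger` is configured):
# response_xml('<?xml version="1.0"?>\n'
#              '<aspsms><ErrorCode>1</ErrorCode>'
#              '<ErrorDescription>StatusCode: 1</ErrorDescription></aspsms>')
# -> {'ErrorCode': '1', 'ErrorDescription': 'StatusCode: 1'}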
def connectionLost(self, reason):
    logger.info('Lost client "{nick}"', nick=self._client.nick)

    announce = Protocol.quit(self._client, "Connection lost")
    for channel in self._client.channels:
        channel.clients.remove(self._client)
        channel.send(announce)

    self.factory.clients.remove(self)
def copy_legacy_content_to_new_location(youtube_id):
    """Copies the MP4 & PNG files from a legacy-format video in the S3
    converted bucket to the new naming scheme."""
    for key in converted_bucket.list(prefix="{0}/".format(youtube_id)):
        legacy_match = re_legacy_video_key_name.match(key.name)
        assert legacy_match is not None
        assert legacy_match.group(1) == youtube_id

        dest_key = "{0}.mp4/{1}".format(youtube_id, legacy_match.group(2))
        logger.info("Copying {0} to {1}".format(key.name, dest_key))
        key.copy(converted_bucket.name, dest_key, preserve_acl=True)
def mail(params):
    from mail import send_mail
    logger.info('cli sends mail')
    response = send_mail(params.to, params.message,
                         subject=params.subject,
                         subjecttag=params.subjecttag,
                         subjectdate=params.subjectd,
                         cc=params.cc, bcc=params.bcc,
                         files=params.files,
                         sender=params.sender,
                         footer=params.footer)
    if response == True:
        parser.exit(status=0, message='success, mail sent\n')
    else:
        parser.exit(status=-1, message='fail: %s\n' % (response))
def twitter(params):
    from twitter import send_tweet
    logger.info('cli sends tweet')
    response = send_tweet(params.message, mention=params.mention)
    if response == True:
        parser.exit(status=0, message='success, tweet sent\n')
    else:
        parser.exit(status=-1, message='fail: %s\n' % (response))
def aspsms(params):
    from aspsms import send_aspsms
    logger.info('cli sends aspsms')
    response = send_aspsms(params.to, params.message,
                           originator=params.origin,
                           flashing=params.flashing,
                           maxchars=params.maxchars)
    if response == True:
        parser.exit(status=0, message='success, aspsms sent\n')
    else:
        parser.exit(status=-1, message='fail: %s\n' % (response))
def run(self):
    client = ConnectFactory().getConnect("redis", self.config)
    for msg in self.consumer:
        kafkamsg = self._decodemsg(msg)
        try:
            logger.info("message handling(%s)" % kafkamsg)
            jsondata = json.loads(kafkamsg['rawdata'])
            ObjectFactory.fromjson(jsondata["message"]).execute(client)
        except Exception:
            # Log the raw message: jsondata may not be bound if json.loads failed
            logger.error("message execute error(%s)" % kafkamsg)
def update_download_available(youtube_id, available_formats):
    url = "http://www.khanacademy.org/api/v1/videos/%s/download_available" % youtube_id
    params = {
        'formats': ','.join(available_formats),
        'key': secrets.ka_download_available_secret,
    }
    response = urllib2.urlopen(url, data=urllib.urlencode(params))
    logger.info(response.read())
    return response.code == 200
def send_mail(to, messagetext, subject=None, **args):
    cc = args.get('cc', [])
    bcc = args.get('bcc', [])
    recipients = list(chain(to, cc, bcc))

    sender = args.get('sender', getconf('email_sender'))
    if not sender:
        sender = getconf('email_sender')

    footer = args.get('footer', getconf('email_footer'))
    if not footer:
        footer = getconf('email_footer')

    subjecttag = args.get('subjecttag', getconf('email_defaulttag'))
    subjectdate = args.get('subjectdate', getconf('email_subject_date'))
    files = args.get('files', [])

    logger.info('~' * 23)
    logger.info('sending new mail using %s:\n%d recipients ~ %d cc, %d bcc, %d files'
                % (sender, len(recipients), len(cc), len(bcc), len(files)))

    message = make_header(to, sender, cc, subject, subjecttag, subjectdate)
    message.attach(make_mime_text(messagetext, footer))
    [message.attach(make_mime_file(f)) for f in files]

    session = dialup()
    if session is not None:
        try:
            session.sendmail(sender, recipients, message.as_string().encode('UTF-8'))
        except SMTPException as ex:
            logger.error('smtp error: %s' % (ex))
            return ex
        else:
            logger.info('mail sent')
            return True
        finally:
            ext_log(session.quit(), 'quit')
    logger.info('end mail')
def send_aspsms(to, messagetext, **args):
    originator = args.get('originator', getconf('aspsms_originator'))
    if not originator:
        originator = getconf('aspsms_originator')

    flashing = args.get('flashing', getconf('aspsms_flashing'))
    maxchars = args.get('maxchars', getconf('aspsms_maxchars'))

    logger.info('~' * 23)
    logger.info('sending new aspsms using %s:\n%d recipients ~ flashing: %s'
                % (originator, len(to), flashing))

    message = wrap(messagetext, maxchars)

    try:
        for recipient in to:
            for text in message:
                payload = make_xml(recipient, originator, text, flashing)
                response = response_xml(post_xml(payload))
                if not response['ErrorCode'] == '1':
                    raise Exception('aspsms error: %s' % (response['ErrorDescription']))
    except Exception as ex:
        logger.error('error: %s' % (ex))
        return ex
    else:
        logger.info('aspsms sent')
        return True
    logger.info('end aspsms')
def printDelayStatsFor(conv):
    delay = conv.stats.getDelayStats()
    logger.info("##Reply Delay Stats")
    logger.info("Reply delay by sender: ")
    for s, d in delay.items():
        msg = "Between {} and {}".format(s.split(':')[0], s.split(':')[1])
        logger.info('{} : {}'.format(msg, d))
    logger.info('-' * 10)
def list_converted_formats():
    """Returns a dict that maps youtube_ids (keys) to a set of available
    converted formats (values)"""
    converted_videos = defaultdict(set)
    legacy_video_keys = set()
    for key in converted_bucket.list(delimiter="/"):
        video_match = re_video_key_name.match(key.name)
        if video_match is None:
            if re_legacy_video_key_name.match(key.name) is not None:
                legacy_video_keys.add(key.name)
            else:
                logger.warning("Unrecognized key {0} is not in format "
                               "YOUTUBE_ID.FORMAT/".format(key.name))
        else:
            converted_videos[video_match.group(1)].add(video_match.group(2))
    logger.info("{0} legacy converted videos were ignored".format(len(legacy_video_keys)))
    return converted_videos
def start_converting(youtube_id, s3_url, formats_to_create):
    thumbnail_time = youtube.get_thumbnail_time(youtube_id)
    assert thumbnail_time

    zen = Zencoder(zencoder_api_key)
    outputs = []
    for format_to_create in formats_to_create:
        if format_to_create in output_types():
            outputs += [fxn(youtube_id, thumbnail_time)
                        for fxn in output_types()[format_to_create]]

    job_response = zen.job.create(s3_url, outputs=outputs)
    assert job_response.code == 201, job_response.body
    logger.info("Zencoder job created successfully")
def start_converting(youtube_id, s3_url, formats_to_create, base_url=BASE_URL):
    # TODO(csilvers): figure out how to get thumbnail times from youtube APIv3
    # thumbnail_time = youtube.get_thumbnail_time(youtube_id)
    thumbnail_time = None

    zen = Zencoder(zencoder_api_key)
    outputs = []
    for format_to_create in formats_to_create:
        assert format_to_create in output_types(), (format_to_create, output_types())
        outputs += [fxn(youtube_id, thumbnail_time, base_url)
                    for fxn in output_types()[format_to_create]]

    job_response = zen.job.create(s3_url, outputs=outputs)
    assert job_response.code == 201, job_response.body
    logger.info("Zencoder job created successfully")
def blog_article(year, month, day, title):
    created = datetime.datetime(year, month, day)
    para = {"created": created,
            "created_next_day": created + datetime.timedelta(days=1),
            "title": title}
    logger.info(
        "SELECT * FROM blog WHERE created >= %(created)s and "
        "created < %(created_next_day)s and title = %(title)s" % para
    )
    db.execute(
        "SELECT * FROM blog WHERE created >= %(created)s and "
        "created < %(created_next_day)s and title = %(title)s",
        para
    )
    r = db.fetchall()
    for i in r:
        i["source_file"] = os.path.join("blog", i["source_file"] + ".html")
    if r:
        para = {"data": r}
        return render_template("blog/article.html", **para)
    else:
        return render_template("404.html"), 404
def main():
    logger.info("Adding hooks...")
    addhooks()
    logger.info("Done with hooks!")

    logger.info("Setting up factory")
    factory = IRCFactory()
    factory.protocol = ClientConnection

    logger.info("Listening")
    reactor.listenTCP(6667, factory)
    reactor.run()
def download(youtube_id):
    temp_dir = tempfile.mkdtemp()

    # Fake up a YouTube URL since youtube-dl expects one
    youtube_url = "http://www.youtube.com/watch?v={0}".format(youtube_id)

    video_filename_template = youtube_id + ".%(ext)s"
    video_path_template = os.path.join(temp_dir, video_filename_template)

    command_args = ["python", "youtube-dl/youtube-dl.py",
                    "--max-quality", "22",
                    "-icw",
                    "-o", video_path_template,
                    youtube_url]

    results = popen_results(command_args)
    logger.info(results)

    files = os.listdir(temp_dir)
    assert len(files) == 1

    video_path = os.path.join(temp_dir, files[0])
    logger.info(video_path)

    return video_path
def handle_user(data, match, client, channels):
    """
    This is a handshake. We'll blatantly disregard the whole pinging thing,
    because I don't really care. Clients still ping the server as it is.
    We'll see how it works out.

    '^USER (?P<user>[^\s]+) (?P<mode>[^\s]+) (?P<junk>[^\s]+) :(?P<name>.*)'

    :type data: str
    :type match: dict
    :type client: Client
    :type channels: list
    """
    client.name = match['name']
    client.host = match['host']

    logger.info("Set new user's name to '{name}'", name=client.name)
    logger.info("Set new user's host to '{host}'", host=client.host)

    response = Protocol.handshake(client)
    return [line.format(nick=client.nick) for line in response]
def get_or_create_unconverted_source_url(youtube_id):
    matching_keys = list(unconverted_bucket.list(youtube_id))

    matching_key = None

    if len(matching_keys) > 0:
        if len(matching_keys) > 1:
            logger.warning("More than 1 matching unconverted video URL found "
                           "for video {0}".format(youtube_id))
        matching_key = matching_keys[0]
    else:
        logger.info("Unconverted video not available on s3 yet, downloading "
                    "from youtube to create it.")
        video_path = youtube.download(youtube_id)
        logger.info("Downloaded video to {0}".format(video_path))
        assert(video_path)

        video_extension = splitext(video_path)[1]
        assert video_extension[0] == "."
        video_extension = video_extension[1:]
        if video_extension not in ["flv", "mp4"]:
            logger.warning("Unrecognized video extension {0} when downloading "
                           "video {1} from YouTube".format(video_extension, youtube_id))

        matching_key = Key(unconverted_bucket,
                           "{0}/{0}.{1}".format(youtube_id, video_extension))
        matching_key.set_contents_from_filename(video_path)

        os.remove(video_path)
        logger.info("Deleted {0}".format(video_path))

    return "s3://{0}/{1}".format(unconverted_bucket.name, matching_key.name)
def updateswitch(switch_ip, dms_location, redis_server):
    logger.info('start to update switch(%s)' % switch_ip)
    headers = {'content-type': 'application/json'}
    switch = {}
    switch['managementIp'] = switch_ip
    keys = redis_server.hkeys('locations')
    location_id = None
    for key in keys:
        value = redis_server.hget('locations', key)
        if value == dms_location:
            location_id = key
    if not location_id:
        print 'can not find corresponding location id'
        logger.error("cannot find the location id")
        sys.exit(-1)
    switch['locationId'] = location_id
    switch_json = json.dumps(switch)
    logger.info('request:%s' % switch_json)
    r = requests.post('%s/switch' % dso_url, data=switch_json, headers=headers)
    if r.status_code == 201:
        print 'notify dso to bind location and switch successfully.'
    else:
        print 'fail to notify dso to bind location and switch:status_code(%s),content(%s)' % (r.status_code, r.content)
def deleteport(switch_json):
    logger.info('start to delete port:(%s)' % switch_json)
    headers = {'content-type': 'application/json'}
    r = requests.delete("%s/switch" % dso_url, data=switch_json, headers=headers)
    logger.info(r.status_code)
    logger.info(r.content)
    if r.status_code == 200:
        print 'notify dso to delete ports successfully.'
    else:
        print r.content
        sys.exit(-1)
def createswitch(switch_json):
    logger.info('start to create switch:(%s)' % switch_json)
    headers = {'content-type': 'application/json'}
    r = requests.post("%s/switch" % dso_url, data=switch_json, headers=headers)
    logger.info(r.status_code)
    logger.info(r.content)
    if r.status_code == 201:
        print 'notify dso to create switch successfully.'
    else:
        print r.content
        print 'fail to notify dso to create switch.'
        sys.exit(-1)
def repost_replies(account_name): bf = open('.blacklist_%s'%account_name,'a+') blacklist = bf.read().splitlines() bf.close() rp = open('.reposted_%s'%account_name,'a+') reposted = rp.read().splitlines() account = settings.ACCOUNTS.get(account_name) try: logging.info('[%s] Getting last mentions offset'%account_name) bot = TwitterBot(settings.CONSUMER_KEY,settings.CONSUMER_SECRET, account['key'],account['secret']) mentions = [] try: mentions = bot.api.mentions() logging.info('[%s] Got %d mentions'%(account_name,len(mentions))) except Exception,e: logging.error('[%s] Failed to get mentions. %s'%(account_name,e)) for mess in reversed(mentions): try: author = mess.author.screen_name if str(author) in blacklist: logging.debug('[%s] Author %s blacklisted. Skipping.'%(account_name,str(author))) continue if str(mess.id) in reposted: logging.debug('[%s] Message #%s already reposted. Skipping.'%(account_name,str(mess.id))) continue message = mess.text.split(' ') if message[0] != '@%s'%account_name: continue #not a "@reply" trigger = message[1] triggers = dict(account['triggers']) if trigger not in triggers: logging.warning('[%s] Bad message format, sending DM to author'%account_name) bot.dm(author,account['not_triggered']) else: len_params = {'message':'','user':author} mess_len = len(triggers[trigger]%len_params) params = {'message':bot.trim_message(' '.join(message[2:]),mess_len),'user':author} message = triggers[trigger]%params logging.info('[%s] Tweeting message %s'%(account_name,message)) bot.tweet(message) rp.write('%s\n'%mess.id) except Exception,e: logging.error('%s'%e) continue
def get_or_create_unconverted_source_url(youtube_id):
    matching_keys = list(unconverted_bucket.list(youtube_id))

    # TODO(alpert): How do these .part files get created? They're not real
    # video files and should be ignored.
    matching_keys = [key for key in matching_keys
                     if not key.name.endswith('.part')]

    matching_key = None

    if matching_keys:
        if len(matching_keys) > 1:
            logger.warning("More than 1 matching unconverted video "
                           "URL found for video {0}".format(youtube_id))
        matching_key = matching_keys[0]
    else:
        logger.info("Unconverted video not available on s3 yet, "
                    "downloading from youtube to create it.")
        video_path = youtube.download(youtube_id)
        if not video_path:
            logger.warning("Error downloading video {0}".format(youtube_id))
            return
        logger.info("Downloaded video to {0}".format(video_path))

        video_extension = splitext(video_path)[1]
        assert video_extension[0] == "."
        video_extension = video_extension[1:]
        if video_extension not in ["flv", "mp4"]:
            logger.warning("Unrecognized video extension {0} when downloading "
                           "video {1} from YouTube".format(
                               video_extension, youtube_id))

        matching_key = Key(unconverted_bucket, "{0}/{0}.{1}".format(
            youtube_id, video_extension))
        matching_key.set_contents_from_filename(video_path)

        os.remove(video_path)
        logger.info("Deleted {0}".format(video_path))

    return "s3://{0}/{1}".format(unconverted_bucket.name, matching_key.name)
def change_baud(self, baud):
    self._close()
    logger.info("Change channel %s from %s to %s"
                % (self.ser.port, self.ser.baudrate, int(baud)))
    self.ser.baudrate = int(baud)
    return self._open()
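# A minimal usage sketch for change_baud (assumes `self.ser` is a pyserial
# serial.Serial instance and that _close()/_open() wrap ser.close()/ser.open();
# `channel` is a hypothetical instance of the enclosing class):
# channel.change_baud(115200)  # reopens the port at the new baud rate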
def scrapeConversation(self, merge, offset, timestampOffset, chunkSize, limit, isGroupConversation): """Retrieves conversation messages and stores them in a JSON file If merge is specified, the new messages will be merged with the previous version of the conversation, if present """ if merge: if not os.path.exists(self._directory + "conversation.json"): logger.error("Conversation not present. Merge operation not possible") return with open(self._directory + "conversation.json") as conv: convMessages = json.load(conv) numMergedMsgs = 0 if not os.path.exists(self._directory): os.makedirs(self._directory) logger.info("Starting scraping of conversation {}".format(self._convID)) messages = [] msgsData = "" timestamp = "" if timestampOffset == 0 else str(timestampOffset) while self.CONVERSATION_ENDMARK not in msgsData: requestChunkSize = chunkSize if limit <= 0 else min(chunkSize, limit-len(messages)) reqData = self.generateRequestData(offset, timestamp, requestChunkSize, isGroupConversation) logger.info("Retrieving messages " + str(offset) + "-" + str(requestChunkSize+offset)) msgsData = self.executeRequest(reqData) jsonData = json.loads(msgsData) if jsonData and ('payload' in jsonData) and jsonData['payload']: if ('actions' in jsonData['payload']) and jsonData['payload']['actions']: actions = jsonData['payload']['actions'] #case when the last message already present in the conversation #is older newer than the first one of the current retrieved chunk if merge and convMessages[-1]["timestamp"] > actions[0]["timestamp"]: for i, action in enumerate(actions): if convMessages[-1]["timestamp"] == actions[i]["timestamp"]: numMergedMsgs = len(actions[i+1:-1]) + len(messages) messages = convMessages + actions[i+1:-1] + messages break break #We retrieve one message two times, as the first one of the previous chunk #and as the last one of the new one. So we here remove the duplicate, #but only once we already retrieved at least one chunk if len(messages) == 0: messages = actions else: messages = actions[:-1] + messages #update timestamp timestamp = str(actions[0]["timestamp"]) else: if 'errorSummary' in jsonData: logger.error("Response error: " + jsonData['errorSummary']) else: logger.error("Response error. No messages found") logger.error(msgsData) return else: logger.error("Response error. Empty data or payload") logger.error(msgsData) logger.info("Retrying in " + str(self.ERROR_WAIT) + " seconds") time.sleep(self.ERROR_WAIT) continue offset += chunkSize if limit!= 0 and len(messages) >= limit: break time.sleep(self.REQUEST_WAIT) if merge: logger.info("Successfully merged {} new messages".format(numMergedMsgs)) logger.info("Conversation total message count = {}".format(len(messages))) else: logger.info("Conversation scraped successfully. {} messages retrieved".format(len(messages))) self.writeMessages(messages)
def cw_tree_attack(): cw = CarliniL2_qa(debug=args.debugging) criterion = nn.CrossEntropyLoss() loss = 0 tot = 0 adv_loss = 0 targeted_success = 0 untargeted_success = 0 adv_text = [] answers = dict() adv_answers = dict() embed = torch.load(args.word_vector) device = torch.device("cuda:0" if args.cuda else "cpu") vocab = Vocab(filename=args.dictionary, data=[PAD_WORD, UNK_WORD, EOS_WORD, SOS_WORD]) generator = Generator(args.test_data, vocab=vocab, embed=embed) transfered_embedding = torch.load('bidaf_transfered_embedding.pth') transfer_emb = torch.nn.Embedding.from_pretrained(transfered_embedding).to( device) seqback = WrappedSeqback(embed, device, attack=True, seqback_model=generator.seqback_model, vocab=vocab, transfer_emb=transfer_emb) treelstm = generator.tree_model generator.load_state_dict(torch.load(args.load_ae)) backup_params = EMA(0) for name, param in model.named_parameters(): if param.requires_grad: backup_params.register(name, param.data) param.data.copy_(ema.get(name)) class TreeModel(nn.Module): def __init__(self): super(TreeModel, self).__init__() self.inputs = None def forward(self, hidden): self.embedding = seqback(hidden) return model(batch, perturbed=self.embedding) def set_temp(self, temp): seqback.temp = temp def get_embedding(self): return self.embedding def get_seqback(self): return seqback tree_model = TreeModel() for batch in tqdm(iter(data.dev_iter), total=1000): p1, p2 = model(batch) orig_answer, orig_s_idx, orig_e_idx = write_to_ans( p1, p2, batch, answers) batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx) loss += batch_loss.item() append_info = append_input(batch, vocab) batch_add_start = append_info['add_start'] batch_add_end = append_info['add_end'] batch_start_target = torch.LongTensor( append_info['target_start']).to(device) batch_end_target = torch.LongTensor( append_info['target_end']).to(device) add_sents = append_info['append_sent'] input_embedding = model.word_emb(batch.c_word[0]) append_info['tree'] = [generator.get_tree(append_info['tree'])] seqback.sentences = input_embedding.clone().detach() seqback.batch_trees = append_info['tree'] seqback.batch_add_sent = append_info['ae_sent'] seqback.start = append_info['add_start'] seqback.end = append_info['add_end'] seqback.adv_sent = [] batch_tree_embedding = [] for bi, append_sent in enumerate(append_info['ae_sent']): sentences = [ torch.tensor(append_sent, dtype=torch.long, device=device) ] trees = [append_info['tree'][bi]] tree_embedding = treelstm(sentences, trees)[0][0].detach() batch_tree_embedding.append(tree_embedding) hidden = torch.cat(batch_tree_embedding, dim=0) cw.batch_info = append_info cw.num_classes = append_info['tot_length'] cw.run(tree_model, hidden, (batch_start_target, batch_end_target), input_token=input_embedding) seqback.adv_sent = [] # re-test for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)): if bi in cw.o_best_sent: ae_words = cw.o_best_sent[bi] bidaf_tokens = bidaf_convert_to_idx(ae_words) batch.c_word[0].data[bi, add_start:add_end] = torch.LongTensor( bidaf_tokens) p1, p2 = model(batch) adv_answer, adv_s_idx, adv_e_idx = write_to_ans( p1, p2, batch, adv_answers) batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx) adv_loss += batch_loss.item() for bi, (start_target, end_target) in enumerate( zip(batch_start_target, batch_end_target)): start_output = adv_s_idx end_output = adv_e_idx targeted_success += int( compare(start_output, start_target.item(), end_output, end_target.item())) untargeted_success += int( 
compare_untargeted(start_output, start_target.item(), end_output, end_target.item())) for i in range(len(add_sents)): logger.info(("orig:", transform(add_sents[i]))) try: logger.info(("adv:", cw.o_best_sent[i])) adv_text.append({ 'adv_text': cw.o_best_sent[i], 'qas_id': batch.id[i], 'adv_predict': (orig_s_idx, orig_e_idx), 'orig_predict': (adv_s_idx, adv_e_idx), 'Orig answer:': orig_answer, 'Adv answer:': adv_answer }) joblib.dump(adv_text, root_dir + '/adv_text.pkl') except: adv_text.append({ 'adv_text': transform(add_sents[i]), 'qas_id': batch.id[i], 'adv_predict': (orig_s_idx, orig_e_idx), 'orig_predict': (adv_s_idx, adv_e_idx), 'Orig answer:': orig_answer, 'Adv answer:': adv_answer }) joblib.dump(adv_text, root_dir + '/adv_text.pkl') continue # for batch size = 1 tot += 1 logger.info(("orig predict", (orig_s_idx, orig_e_idx))) logger.info(("adv append predict", (adv_s_idx, adv_e_idx))) logger.info(("targeted successful rate:", targeted_success)) logger.info(("untargetd successful rate:", untargeted_success)) logger.info(("Orig answer:", orig_answer)) logger.info(("Adv answer:", adv_answer)) logger.info(("tot:", tot)) for name, param in model.named_parameters(): if param.requires_grad: param.data.copy_(backup_params.get(name)) with open(options.prediction_file, 'w', encoding='utf-8') as f: print(json.dumps(answers), file=f) with open(options.prediction_file + '_adv.json', 'w', encoding='utf-8') as f: print(json.dumps(adv_answers), file=f) results = evaluate.main(options) logger.info(tot) logger.info(("adv loss, results['exact_match'], results['f1']", loss, results['exact_match'], results['f1'])) return loss, results['exact_match'], results['f1']
except: # result[data['qas_id']] = [' '.join(transform(data['adv_text']))] result[data['qas_id']] = None return result if __name__ == '__main__': options = args device = torch.device("cuda:{}".format(options.gpu)) best_model_file_name = "model.bin" best_ema = "ema.pth" # ===-----------------------------------------------------------------------=== # Log some stuff about this run # ===-----------------------------------------------------------------------=== logger.info(' '.join(sys.argv)) logger.info('') logger.info(options) logger.info('loading SQuAD data...') data = SQuAD(options) setattr(options, 'char_vocab_size', len(data.CHAR.vocab)) setattr(options, 'word_vocab_size', len(data.WORD.vocab)) if options.test_file is not None: print("testing") setattr(options, 'dataset_file', '.data/squad/{}'.format(options.test_file)) else: setattr(options, 'dataset_file', '.data/squad/{}'.format(options.dev_file)) setattr(options, 'prediction_file',
def check_backups_completeness(full_backup_info, increment_backup_info):
    """Verify the completeness of the backups."""
    if len(increment_backup_info) == 0:
        logger.info('Performing a full restore')
        logger.info('Backups are complete')
        return True
    elif len(increment_backup_info) > 0:
        logger.info('Performing a full restore + incremental restore')
        checkpoint_lsn = full_backup_info[0]['to_lsn']
        for ibi in increment_backup_info:
            if checkpoint_lsn != ibi['from_lsn']:
                logger.info('Backups are incomplete')
                logger.info('The incremental backup with id ' + ibi['id'] + ' is invalid')
                sys.exit(1)
            else:
                checkpoint_lsn = ibi['to_lsn']
        logger.info('Backups are complete')
        return True
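# A minimal sketch of the backup-info structures check_backups_completeness
# assumes (hypothetical values): each incremental backup must start at the
# LSN where the previous backup ended.
# full_backup_info = [{'id': 'full-0', 'from_lsn': '0', 'to_lsn': '1000'}]
# increment_backup_info = [
#     {'id': 'inc-1', 'from_lsn': '1000', 'to_lsn': '2000'},
#     {'id': 'inc-2', 'from_lsn': '2000', 'to_lsn': '3000'},
# ]
# check_backups_completeness(full_backup_info, increment_backup_info)  # -> True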
def exec_start_mysqld_safe(mysql_config_path):
    """Start the mysql service via mysqld_safe."""
    logger.info('Trying to start mysql with mysqld_safe')
    cmd = ['mysqld_safe', '--defaults-file=' + mysql_config_path]
    util.exec.exec_cmd(cmd, exec_start_mysqld_safe.__doc__, backgroud=True)
def update_ppo_parameters(self, paths): ''' @brief: update the ppo ''' # step 1: get the data dict ob_normalizer_info = paths.pop() feed_dict = self.prepared_network_feeddict(paths) # step 2: train the network logger.info('| %11s | %11s | %11s | %11s| %11s|' % ('surr', 'kl', 'ent', 'vf_loss', 'weight_l2')) self.timesteps_so_far += self.args.timesteps_per_batch for i_epochs in range(self.args.optim_epochs + self.args.extra_vf_optim_epochs): minibatch_id_candidate = range( feed_dict[self.action_placeholder].shape[0]) self._npr.shuffle(minibatch_id_candidate) # make sure that only timesteps per batch is used minibatch_id_candidate = \ minibatch_id_candidate[: self.args.timesteps_per_batch] current_id = 0 surrogate_epoch, kl_epoch, entropy_epoch, vf_epoch, weight_epoch = \ [], [], [], [], [] while current_id + self.args.optim_batch_size <= \ len(minibatch_id_candidate) and current_id >= 0: # fetch the minidata batch sub_feed_dict, current_id, minibatch_id_candidate = \ self.construct_minibatchFeeddict_from_feeddict( feed_dict, minibatch_id_candidate, current_id, self.args.optim_batch_size, is_all_feed=self.args.minibatch_all_feed ) if i_epochs < self.args.optim_epochs: # train for one iteration in this epoch _, i_surrogate_mini, i_kl_mini, i_entropy_mini, \ i_weight_mini = self.session.run( [self.update_op, self.surr, self.kl, self.ent, self.weight_decay_loss], feed_dict=sub_feed_dict ) # train the value network with fixed network coeff _, i_vf_mini = self.session.run( [self.update_vf_op, self.vf_loss], feed_dict=sub_feed_dict) surrogate_epoch.append(i_surrogate_mini) kl_epoch.append(i_kl_mini) entropy_epoch.append(i_entropy_mini) vf_epoch.append(i_vf_mini) weight_epoch.append(i_weight_mini) else: # only train the value function, might be unstable if share # the value network and policy network _, i_vf_mini = self.session.run( [self.update_vf_op, self.vf_loss], feed_dict=sub_feed_dict) vf_epoch.append(i_vf_mini) if i_epochs < self.args.optim_epochs: surrogate_epoch = np.mean(surrogate_epoch) kl_epoch = np.mean(kl_epoch) entropy_epoch = np.mean(entropy_epoch) vf_epoch = np.mean(vf_epoch) weight_epoch = np.mean(weight_epoch) else: surrogate_epoch = -0.1 kl_epoch = -0.1 entropy_epoch = -0.1 weight_epoch = -0.1 vf_epoch = np.mean(vf_epoch) # if we use kl_penalty, we will do early stopping if needed if self.args.use_kl_penalty: assert self.args.minibatch_all_feed, logger.error( 'KL penalty not available for epoch minibatch training') if kl_epoch > 4 * self.args.target_kl and \ self.args.minibatch_all_feed: logger.info('Early Stopping') break logger.info('| %10.8f | %10.8f | %10.4f | %10.4f | %10.4f |' % (surrogate_epoch, kl_epoch, entropy_epoch, vf_epoch, weight_epoch)) i_surrogate_total, i_kl_total, i_entropy_total, \ i_vf_total, i_weight_total = self.session.run( [self.surr, self.kl, self.ent, self.vf_loss, self.weight_decay_loss], feed_dict=feed_dict ) # step 3: update the hyperparameters of updating self.update_adaptive_hyperparams(kl_epoch, i_kl_total) # step 4: record the stats stats = {} episoderewards = np.array([path["rewards"].sum() for path in paths]) stats["avg_reward"] = episoderewards.mean() stats["entropy"] = i_entropy_total stats["kl"] = i_kl_total stats["surr_loss"] = i_surrogate_total stats["vf_loss"] = i_vf_total stats["weight_l2_loss"] = i_weight_total stats['learning_rate'] = self.current_lr if self.args.use_kl_penalty: stats['kl_lambda'] = self.current_kl_lambda # step 5: record the summary and save checkpoints self.record_summary_and_ckpt(paths, stats, ob_normalizer_info) 
return stats
def run(): logger.info('using device:{}'.format(config.DEVICE)) # 对话model dialogue_model, _ = create_model(pre_trained=True) dialogue_model.to(config.DEVICE) dialogue_model.eval() # 互信息mmi model mmi_model, _ = create_model(pre_trained=True, mmi=True) mmi_model.to(config.DEVICE) mmi_model.eval() if not os.path.exists(config.SAVE_SAMPLES_PATH): os.makedirs(config.SAVE_SAMPLES_PATH) samples_file = open(config.SAVE_SAMPLES_PATH + "/mmi_samples.txt", "a", encoding="utf8") samples_file.write("聊天记录: {}\n".format(datetime.now())) # 存储聊天记录,每个utterance以token的id的形式进行存储 history = [] print('开始和 chatbot 聊天,输入CTRL + Z以退出') while True: try: text = input("user: "******"user: {}\n".format(text)) history.append(config.TOKENIZER.encode(text)) input_ids = [config.TOKENIZER.cls_token_id] for history_id, history_utter in enumerate( history[-config.MAX_HISTORY_LEN:]): input_ids.extend(history_utter) input_ids.append(config.TOKENIZER.sep_token_id) # 用于批量生成response,维度为(batch_size, token_len) input_ids = [ copy.deepcopy(input_ids) for _ in range(config.BATCH_SIZE) ] curr_input_tensors = torch.tensor(input_ids).long().to( config.DEVICE) # 二维数组,维度为(生成的response的最大长度,batch_size), # generated[i,j]表示第j个response的第i个token的id generated = [] # 标记是否所有response均已生成结束,若第i个response生成结束, # 即生成了sep_token_id,则将i放入finish_set finish_set = set() # 最多生成max_len个token for _ in range(config.MAX_LEN): outputs = dialogue_model(input_ids=curr_input_tensors) next_token_logits = outputs[0][:, -1, :] # 对于已生成的结果generated中的每个token添加一个重复惩罚项,降低其生成概率 for index in range(config.BATCH_SIZE): for token_id in set( [token_ids[index] for token_ids in generated]): next_token_logits[index][ token_id] /= config.REPETITION_PENALTY next_token_logits = next_token_logits / config.TEMPERATURE # 对于[UNK]的概率设为无穷小,也就是说模型的预测结果不可能是[UNK]这个token for next_token_logit in next_token_logits: next_token_logit[config.TOKENIZER.convert_tokens_to_ids( "[UNK]")] = -float("Inf") filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=config.TOP_K, top_p=config.TOP_P) # torch.multinomial表示从候选集合中无放回地进行抽取num_samples个元素,权重越高,抽到的几率越高,返回元素的下标 next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1) # 判断是否有response生成了[SEP],将已生成了[SEP]的response进行标记 for index, token_id in enumerate(next_token[:, 0]): if token_id == config.TOKENIZER.sep_token_id: finish_set.add(index) # 检验是否所有的response均已生成[SEP] finish_flag = True # 是否所有的response均已生成[SEP]的token for index in range(config.BATCH_SIZE): if index not in finish_set: # response批量生成未完成 finish_flag = False break if finish_flag: break generated.append([token.item() for token in next_token[:, 0]]) # 将新生成的token与原来的token进行拼接 curr_input_tensors = torch.cat( (curr_input_tensors, next_token), dim=-1) candidate_responses = [] # 生成的所有候选response for batch_index in range(config.BATCH_SIZE): response = [] for token_index in range(len(generated)): if generated[token_index][ batch_index] != config.TOKENIZER.sep_token_id: response.append(generated[token_index][batch_index]) else: break candidate_responses.append(response) # mmi模型的输入 if config.DEBUG: print("candidate response:") samples_file.write("candidate response:\n") min_loss = float("Inf") best_response = "" for response in candidate_responses: mmi_input_id = [config.TOKENIZER.cls_token_id ] # 每个input以[CLS]为开头 mmi_input_id.extend(response) mmi_input_id.append(config.TOKENIZER.sep_token_id) for history_utter in reversed( history[-config.MAX_HISTORY_LEN:]): mmi_input_id.extend(history_utter) mmi_input_id.append(config.TOKENIZER.sep_token_id) mmi_input_tensor = 
torch.tensor(mmi_input_id).long().to( config.DEVICE) out = mmi_model(input_ids=mmi_input_tensor, labels=mmi_input_tensor) loss = out[0].item() if config.DEBUG: text = config.TOKENIZER.convert_ids_to_tokens(response) print("{} loss:{}".format("".join(text), loss)) samples_file.write("{} loss:{}\n".format("".join(text), loss)) if loss < min_loss: best_response = response min_loss = loss history.append(best_response) text = config.TOKENIZER.convert_ids_to_tokens(best_response) print("chatbot:" + "".join(text)) if config.SAVE_SAMPLES_PATH: samples_file.write("chatbot:{}\n".format("".join(text))) except KeyboardInterrupt: if config.SAVE_SAMPLES_PATH: samples_file.close() break
def wrapped(self):
    logger.info("Accessing {}".format(method.__name__))
    return method(self)
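# A minimal sketch of the enclosing decorator this `wrapped` closure appears
# to belong to (`method` and `logger` are free variables above; the decorator
# name `logged` is an assumption, not from the original source):
# def logged(method):
#     def wrapped(self):
#         logger.info("Accessing {}".format(method.__name__))
#         return method(self)
#     return wrapped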
def printIntervalStatsFor(start, end, interval, days):
    logger.info("##Conv Interval")
    logger.info("Conversation started: {}".format(str(start)))
    logger.info("Conversation ended: {}".format(str(end)))
    logger.info("Conversation overall duration: {}".format(interval))

    logger.info("{:.0f} days without messages".format(len(days)))
    percentage = (len(days) / (interval.days + 1)) * 100
    logger.info(
        "{0:.2f}% out of the conversation overall days-interval".format(
            percentage))

    #logger.info(days)
    logger.info('-' * 10)
def printLexicalStats(lexicalStatsDf):
    logger.info("##LEXICAL STATS")
    for sender, vals in lexicalStatsDf.iterrows():
        tokensCount, vocabularyCount, lexicalRichness = vals.tolist()
        logger.info("#" + sender)
        logger.info("Tokens count: {:.0f}".format(tokensCount))
        logger.info("Distinct tokens count: {:.0f}".format(vocabularyCount))
        logger.info("Lexical diversity: {0:.5f}".format(lexicalRichness))
    logger.info('-' * 10)
def printBasicLengthStats(basicLengthStatsDf):
    logger.info("##BASIC LENGTH STATS")
    for sender, vals in basicLengthStatsDf.iterrows():
        totalNum, totalLength, avgLength = vals.tolist()
        logger.info("#" + sender)
        logger.info("Number of messages: {:.0f}".format(totalNum))
        logger.info("Total length: {:.0f}".format(totalLength))
        logger.info("Average length: {0:.2f}".format(avgLength))
    logger.info('-' * 10)
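# A minimal sketch of the DataFrame layout these print* helpers assume
# (hypothetical values): one row per sender, columns in the order they are
# unpacked above.
# import pandas as pd
# basicLengthStatsDf = pd.DataFrame(
#     {'totalNum': [120, 98], 'totalLength': [5400, 4100], 'avgLength': [45.0, 41.8]},
#     index=['SENDER_1', 'SENDER_2'])
# printBasicLengthStats(basicLengthStatsDf)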
def printEmoticonsStats(emoticonsStatsDf):
    logger.info("##EMOTICONS STATS")
    for sender, vals in emoticonsStatsDf.iterrows():
        numEmoticons, emoticonsRatio, lenMsgs = vals.tolist()
        logger.info("#" + sender)
        logger.info("Emoticons count: {:.0f}".format(numEmoticons))
        logger.info("Messages total length: {:.0f}".format(lenMsgs))
        logger.info("Ratio: {0:.5f}".format(emoticonsRatio))
    logger.info('-' * 10)
def registered(self, driver, frameworkId, masterInfo):
    """Upon successful initial registration to the Mesos cluster."""
    logger.info("Registered with framework ID %s", frameworkId.value)
def cw_tree_attack(data_val, tree_data): adv_correct = 0 targeted_success = 0 untargeted_success = 0 orig_correct = 0 tot = 0 orig_append_correct = 0 adv_pickle = [] cw = CarliniL2(debug=args.debugging) embed = torch.load(args.word_vector) device = torch.device("cuda:0" if args.cuda else "cpu") vocab = Vocab(filename=args.dictionary, data=[PAD_WORD, UNK_WORD, EOS_WORD, SOS_WORD]) generator = Generator(args.test_data, vocab=vocab, embed=embed, data_set=data_val) bert_transfered_embedding = torch.load('bert_transfered_embedding.pth') transfer_emb = torch.nn.Embedding( bert_transfered_embedding.size(0), bert_transfered_embedding.size(1)).to(device) # transfer_emb = torch.nn.Embedding.from_pretrained(bert_transfered_embedding).to(device) transfer_emb.weight.data.copy_(bert_transfered_embedding) seqback = WrappedSeqback(embed, device, attack=True, seqback_model=generator.seqback_model, vocab=vocab, transfer_emb=transfer_emb) treelstm = generator.tree_model generator.load_state_dict(torch.load(args.load_ae)) class TreeModel(nn.Module): def __init__(self): super(TreeModel, self).__init__() def forward(self, hidden): self.embedding = seqback(hidden) return model(batch['data'], batch['seq_len'], perturbed=self.embedding)['pred'] def set_temp(self, temp): seqback.temp = temp def get_embedding(self): return self.embedding def get_seqback(self): return seqback tree_model = TreeModel() for batch in get_tree_batch(data_val, tree_data, vocab): input_embedding = model.bert.embeddings.word_embeddings(batch['data']) batch['tree'] = [generator.get_tree(tree) for tree in batch['tree']] seqback.sentences = input_embedding.clone().detach() seqback.batch_trees = batch['tree'] seqback.batch_add_sent = batch['ae_add_sents'] seqback.start = batch['add_start'] seqback.end = batch['add_end'] seqback.adv_sent = [] batch_tree_embedding = [] for bi, append_sent in enumerate(batch['ae_add_sents']): sentences = [ torch.tensor(append_sent, dtype=torch.long, device=device) ] trees = [batch['tree'][bi]] tree_embedding = treelstm(sentences, trees)[0][0].detach() batch_tree_embedding.append(tree_embedding) hidden = torch.cat(batch_tree_embedding, dim=0) cw.batch_info = batch adv_hidden = cw.run(tree_model, hidden, batch['attack_targets'], batch_size=hidden.shape[0], input_token=input_embedding) seqback.adv_sent = [] adv_seq = torch.tensor(batch['data']).to(device) for bi, (add_start, add_end) in enumerate( zip(batch['add_start'], batch['add_end'])): if bi in cw.o_best_sent: ae_words = cw.o_best_sent[bi] bert_tokens = tokenizer.convert_tokens_to_ids(ae_words) adv_seq[bi, add_start:add_end] = torch.LongTensor(bert_tokens) out = model(adv_seq, batch['seq_len'])['pred'] prediction = torch.max(out, 1)[1] orig_correct += batch['orig_correct'].item() orig_append_correct += batch['orig_append_correct'].item() adv_correct += torch.sum((prediction == batch['label']).float()).item() targeted_success += torch.sum( (prediction == batch['attack_targets']).float()).item() untargeted_success += untargeted_success_rate(prediction, batch['label']) tot += len(batch['label']) for i in range(len(batch['label'])): adv_pickle.append({ 'raw_text': transform(adv_seq[i]), 'label': batch['label'][i].item() }) try: logger.info(("orig:", transform(batch['add_sents'][i]))) logger.info(("adv:", cw.o_best_sent[i])) except: continue logger.info(("orig_correct:", orig_correct)) logger.info(("orig_append_correct:", orig_append_correct)) logger.info(("adv_correct:", adv_correct)) logger.info(("targeted successful rate:", targeted_success)) 
logger.info(("untargetd successful rate:", untargeted_success)) logger.info(("tot:", tot)) joblib.dump(adv_pickle, root_dir + '/adv_text.pkl') logger.info(("orig_correct:", orig_correct / tot)) logger.info(("orig_append_correct:", orig_append_correct / tot)) logger.info(("adv_correct:", adv_correct / tot)) logger.info(("targeted successful rate:", targeted_success / tot)) logger.info(("untargetd successful rate:", untargeted_success / tot))
def scrapeConversation(self, merge, offset, timestampOffset, chunkSize, limit, isGroupConversation): """Retrieves conversation messages and stores them in a JSON file If merge is specified, the new messages will be merged with the previous version of the conversation, if present """ if merge: if not os.path.exists( os.path.join(self._directory, "conversation.json")): logger.error( "Conversation not present. Merge operation not possible") return with open(os.path.join(self._directory, "conversation.json")) as conv: convMessages = json.load(conv) numMergedMsgs = 0 if not os.path.exists(self._directory): os.makedirs(self._directory) logger.info("Starting scraping of conversation {}".format( self._convID)) messages = [] msgsData = "" timestamp = "" if timestampOffset == 0 else str(timestampOffset) while self.CONVERSATION_ENDMARK not in msgsData: requestChunkSize = chunkSize if limit <= 0 else min( chunkSize, limit - len(messages)) reqData = self.generateRequestData(offset, timestamp, requestChunkSize, isGroupConversation) logger.info("Retrieving messages {}-{}".format( offset, requestChunkSize + offset)) msgsData = self.executeRequest(reqData) jsonData = json.loads(msgsData) if jsonData and ('payload' in jsonData) and jsonData['payload']: if ('actions' in jsonData['payload'] ) and jsonData['payload']['actions']: actions = jsonData['payload']['actions'] #case when the last message already present in the conversation #is older newer than the first one of the current retrieved chunk if merge and convMessages[-1]["timestamp"] > actions[0][ "timestamp"]: for i, action in enumerate(actions): if convMessages[-1]["timestamp"] == actions[i][ "timestamp"]: numMergedMsgs = len( actions[i + 1:-1]) + len(messages) messages = convMessages + actions[ i + 1:-1] + messages break break #We retrieve one message two times, as the first one of the previous chunk #and as the last one of the new one. So we here remove the duplicate, #but only once we already retrieved at least one chunk if len(messages) == 0: messages = actions else: messages = actions[:-1] + messages #update timestamp timestamp = str(actions[0]["timestamp"]) else: if 'errorSummary' in jsonData: logger.error("Response error: " + jsonData['errorSummary']) else: logger.error("Response error. No messages found") logger.error(msgsData) return else: logger.error("Response error. Empty data or payload") logger.error(msgsData) logger.info("Retrying in {} seconds".format(self.ERROR_WAIT)) time.sleep(self.ERROR_WAIT) continue offset += chunkSize if limit != 0 and len(messages) >= limit: break time.sleep(self.REQUEST_WAIT) if merge: logger.info( "Successfully merged {} new messages".format(numMergedMsgs)) logger.info("Conversation total message count = {}".format( len(messages))) else: logger.info( "Conversation scraped successfully. {} messages retrieved". format(len(messages))) self.writeMessages(messages)
def print_config(self):
    logger.info("=============== global config ===============")
    logger.info("queue length : " + str(self.queue_len))
    logger.info("population size : " + str(self.popsize))
    logger.info("mutate probability : " + str(self.prob_mutate))
    logger.info("the number of process (for multiprocessing) : " + str(self.num_processor))
    logger.info("coverage differential threshold : " + str(self.coverage_threshold))
    logger.info("enable transformation based on filter : " + str(self.enable_filters))
    logger.info("enable optimize : " + str(self.enable_optimize))
    logger.info("robust_threshold : " + str(self.robust_threshold))
    logger.info("=============== translation config ===============")
    logger.info("rotation range : " + str(self.rotation_range))
    logger.info("translate range : " + str(self.translate_range))
    logger.info("shear range : " + str(self.shear_range))
    if self.enable_filters:
        logger.info("zoom range : " + str(self.zoom_range))
        logger.info("blur range : " + str(self.blur_range))
        logger.info("brightness range : " + str(self.brightness_range))
        logger.info("contrast range : " + str(self.contrast_range))
    logger.info("mutate step (for genetic algorithm) : " + str(self.translation_step))
    logger.info("=============== Training Start ===============")
def run(self): ''' @brief: this is the standard function to be called by the "multiprocessing.Process" @NOTE: check the parallel_util.py for definitions ''' self.build_models() # load the model if needed if self.args.ckpt_name is not None: self.restore_all() # the main training process while True: next_task = self.task_q.get() # Kill the learner if next_task is None or next_task == parallel_util.END_SIGNAL: self.task_q.task_done() break # Get the policy network weights elif next_task == parallel_util.START_SIGNAL: # just get the params of the network, no learning process self.task_q.task_done() self.result_q.put(self.get_policy()) # Updating the network else: if self.args.test: paths = next_task paths.pop() episoderewards = np.array( [path["rewards"].sum() for path in paths]) self.task_q.task_done() stats = {"avg_reward": episoderewards.mean()} logger.info(stats) return_data = { 'policy_weights': self.get_policy(), 'stats': stats, 'totalsteps': self.args.max_timesteps + 100, 'iteration': self.get_iteration_count(), 'std_reward': episoderewards.std(), "avg_reward": episoderewards.mean(), "max_reward": np.amax(episoderewards), "min_reward": np.amin(episoderewards), "median_reward": np.median(episoderewards), } self.result_q.put(return_data) # the actual training step else: paths = next_task stats = self.update_parameters(paths) self.task_q.task_done() return_data = { 'policy_weights': self.get_policy(), 'stats': stats, 'totalsteps': self.timesteps_so_far, 'iteration': self.get_iteration_count() } self.result_q.put(return_data)
def model_fn(features, labels, mode, params): #### Training or Evaluation is_training = (mode == tf.estimator.ModeKeys.TRAIN) total_loss, per_example_loss, logits = function_builder.get_race_loss( FLAGS, features, is_training) #### Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) logger.info('#params: {}'.format(num_params)) #### load pretrained models scaffold_fn = model_utils.init_from_checkpoint(FLAGS) #### Evaluation mode if mode == tf.estimator.ModeKeys.EVAL: assert FLAGS.num_hosts == 1 def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) eval_input_dict = { 'labels': label_ids, 'predictions': predictions, 'weights': is_real_example } accuracy = tf.metrics.accuracy(**eval_input_dict) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { 'eval_accuracy': accuracy, 'eval_loss': loss} is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) #### Constucting evaluation TPUEstimatorSpec with new cache. label_ids = tf.reshape(features['label_ids'], [-1]) metric_args = [per_example_loss, label_ids, logits, is_real_example] if FLAGS.use_tpu: eval_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=(metric_fn, metric_args), scaffold_fn=scaffold_fn) else: eval_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, eval_metric_ops=metric_fn(*metric_args)) return eval_spec #### Configuring the optimizer train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss) monitor_dict = {} monitor_dict["lr"] = learning_rate #### Constucting training TPUEstimatorSpec with new cache. if FLAGS.use_tpu: #### Creating host calls host_call = None train_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=host_call, scaffold_fn=scaffold_fn) else: train_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op) return train_spec
def exec_service_mysqld_start():
    """Start the mysql service via service."""
    logger.info('Trying to start mysql with service')
    cmd = ['service', 'mysqld', 'start']
    util.exec.exec_cmd(cmd, exec_service_mysqld_start.__doc__)
def main(_): logger.set_verbosity(logger.INFO) #### Validate flags if FLAGS.save_steps is not None: FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not tf.gfile.Exists(FLAGS.output_dir): tf.gfile.MakeDirs(FLAGS.output_dir) sp = spm.SentencePieceProcessor() sp.Load(FLAGS.spiece_model_file) def tokenize_fn(text): text = preprocess_text(text, lower=FLAGS.uncased) return encode_ids(sp, text) # TPU Configuration run_config = model_utils.configure_tpu(FLAGS) model_fn = get_model_fn() spm_basename = os.path.basename(FLAGS.spiece_model_file) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_tpu: estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) else: estimator = tf.estimator.Estimator( model_fn=model_fn, config=run_config) if FLAGS.do_train: train_file_base = "{}.len-{}.train.tf_record".format( spm_basename, FLAGS.max_seq_length) train_file = os.path.join(FLAGS.output_dir, train_file_base) if not tf.gfile.Exists(train_file) or FLAGS.overwrite_data: train_examples = get_examples(FLAGS.data_dir, "train") random.shuffle(train_examples) file_based_convert_examples_to_features( train_examples, tokenize_fn, train_file) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps) if FLAGS.do_eval: eval_examples = get_examples(FLAGS.data_dir, FLAGS.eval_split) logger.info("Num of eval samples: {}".format(len(eval_examples))) # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). # # Modified in XL: We also adopt the same mechanism for GPUs. while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file_base = "{}.len-{}.{}.tf_record".format( spm_basename, FLAGS.max_seq_length, FLAGS.eval_split) if FLAGS.high_only: eval_file_base = "high." + eval_file_base elif FLAGS.middle_only: eval_file_base = "middle." + eval_file_base eval_file = os.path.join(FLAGS.output_dir, eval_file_base) file_based_convert_examples_to_features( eval_examples, tokenize_fn, eval_file) assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=True) ret = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps) # Log current result logger.info("=" * 80) log_str = "Eval | " for key, val in ret.items(): log_str += "{} {} | ".format(key, val) logger.info(log_str) logger.info("=" * 80)
def build_models(self, build_sampler=True): logger.info('Building the text to image GAN model') # 1. real image and right text with tf.variable_scope(""): self.d_network_rr = GAN.img_discriminator(self.config, stage=self.stage) self.d_network_rr.build_models(self.real_img, self.real_sen_rep) self.score_r = self.d_network_rr.get_score() self.loss_r = tf.reduce_mean( compat_tf.sigmoid_cross_entropy_with_logits(logits=self.score_r, labels=tf.ones_like( self.score_r))) logger.info('loss from real image and right text generated') # 2. real image and wrong text with tf.variable_scope("", reuse=True): self.d_network_rw = GAN.img_discriminator(self.config, stage=self.stage) self.d_network_rw.build_models(self.real_img, self.wrong_sen_rep) self.score_rw = self.d_network_rw.get_score() self.loss_w = tf.reduce_mean( compat_tf.sigmoid_cross_entropy_with_logits(logits=self.score_rw, labels=tf.zeros_like( self.score_rw))) logger.info('loss from real image and wrong text generated') # 3. fake image and right text with tf.variable_scope(''): self.g_network = GAN.img_generator(self.config, stage=self.stage) self.g_network.build_image_generator(self.noise_input, self.real_sen_rep) self.fake_img = self.g_network.get_fake_image() with tf.variable_scope("", reuse=True): self.d_network_wr = GAN.img_discriminator(self.config, stage=self.stage) self.d_network_wr.build_models(self.fake_img, self.real_sen_rep) self.fr_score = self.d_network_wr.get_score() self.loss_f = tf.reduce_mean( compat_tf.sigmoid_cross_entropy_with_logits(logits=self.fr_score, labels=tf.zeros_like( self.fr_score))) logger.info('loss from fake image and right text generated') # the loss of generator and the discriminator self.loss_d = self.loss_r + self.loss_f + self.loss_w self.loss_g = tf.reduce_mean( compat_tf.sigmoid_cross_entropy_with_logits(logits=self.fr_score, labels=tf.ones_like( self.fr_score))) # build the sampler if build_sampler: with tf.variable_scope('', reuse=True): self.sample_network = GAN.img_generator(self.config, stage='test') self.sample_network.build_image_generator( self.noise_input, self.real_sen_rep) self.sample_img = self.sample_network.get_fake_image() return
def frameworkMessage(self, driver, executorId, slaveId, message):
    """Echo a framework message."""
    logger.info("Received framework message: %s", message)
def cw_random_word_attack():
    """Run the untargeted Carlini-Wagner attack on randomly appended words."""
    cw = CarliniL2_untargeted_qa(debug=args.debugging)
    criterion = nn.CrossEntropyLoss()
    loss = 0
    adv_loss = 0
    targeted_success = 0
    untargeted_success = 0
    adv_text = []
    answers = dict()
    adv_answers = dict()
    # Swap in the EMA weights for evaluation and keep a backup of the raw weights.
    backup_params = EMA(0)
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup_params.register(name, param.data)
            param.data.copy_(ema.get(name))
    tot = 0
    for batch in tqdm(iter(data.dev_iter), total=1000):
        p1, p2 = model(batch)
        orig_answer, orig_s_idx, orig_e_idx = write_to_ans(p1, p2, batch, answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()

        append_info = append_random_input(batch)
        allow_idxs = append_info['allow_idx']
        batch_start_target = torch.LongTensor([0]).to(device)
        batch_end_target = torch.LongTensor([0]).to(device)

        input_embedding = model.word_emb(batch.c_word[0])
        # Only the appended (allowed) token positions may be perturbed.
        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        cw_mask = torch.from_numpy(cw_mask).float().to(device)
        for bi, allow_idx in enumerate(allow_idxs):
            cw_mask[bi, np.array(allow_idx)] = 1
        cw.wv = model.word_emb.weight
        cw.inputs = batch
        cw.mask = cw_mask
        cw.batch_info = append_info
        cw.num_classes = append_info['tot_length']
        # print(transform(to_list(batch.c_word[0][0])))
        cw.run(model, input_embedding, (batch_start_target, batch_end_target))

        # Re-test: write the best adversarial tokens back into the batch.
        for bi, allow_idx in enumerate(allow_idxs):
            if bi in cw.o_best_sent:
                for i, idx in enumerate(allow_idx):
                    batch.c_word[0].data[bi, idx] = cw.o_best_sent[bi][i]
        p1, p2 = model(batch)
        adv_answer, adv_s_idx, adv_e_idx = write_to_ans(p1, p2, batch, adv_answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        adv_loss += batch_loss.item()

        for bi, (start_target, end_target) in enumerate(
                zip(batch_start_target, batch_end_target)):
            start_output = adv_s_idx
            end_output = adv_e_idx
            targeted_success += int(compare(
                start_output, start_target.item(), end_output, end_target.item()))
            untargeted_success += int(compare_untargeted(
                start_output, start_target.item(), end_output, end_target.item()))

        for i in range(len(allow_idxs)):
            try:
                logger.info(("adv:", transform(cw.o_best_sent[i])))
                adv_text.append({
                    'added_text': transform(cw.o_best_sent[i]),
                    'adv_text': transform(to_list(batch.c_word[0][0])),
                    'qas_id': batch.id[i],
                    'orig_predict': (orig_s_idx, orig_e_idx),
                    'adv_predict': (adv_s_idx, adv_e_idx),
                    'Orig answer:': orig_answer,
                    'Adv answer:': adv_answer
                })
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
            except Exception:
                adv_text.append({
                    'adv_text': transform(to_list(batch.c_word[0][0])),
                    'qas_id': batch.id[i],
                    'orig_predict': (orig_s_idx, orig_e_idx),
                    'adv_predict': (adv_s_idx, adv_e_idx),
                    'Orig answer:': orig_answer,
                    'Adv answer:': adv_answer
                })
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
                continue  # for batch size = 1

        tot += 1
        logger.info(("orig predict", (orig_s_idx, orig_e_idx)))
        logger.info(("adv append predict", (adv_s_idx, adv_e_idx)))
        logger.info(("targeted success count:", targeted_success))
        logger.info(("untargeted success count:", untargeted_success))
        logger.info(("Orig answer:", orig_answer))
        logger.info(("Adv answer:", adv_answer))
        logger.info(("tot:", tot))

    # Restore the original (non-EMA) weights.
    for name, param in model.named_parameters():
        if param.requires_grad:
            param.data.copy_(backup_params.get(name))

    with open(options.prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers), file=f)
    with open(options.prediction_file + '_adv.json', 'w', encoding='utf-8') as f:
        print(json.dumps(adv_answers), file=f)
    results = evaluate.main(options)
    logger.info(tot)
    logger.info(("adv loss, exact_match, f1:",
                 loss, results['exact_match'], results['f1']))
    return loss, results['exact_match'], results['f1']
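# The attack above (and cw_rand_words_attack below) relies on a binary mask over
# the context embeddings so that only the appended/inserted token positions can be
# perturbed. The following is a minimal standalone sketch of that masking idea; the
# tensor shapes and allow_idxs values are illustrative, not taken from the project.
import numpy as np
import torch

# Illustrative shapes only: batch of 2 contexts, 10 tokens each, 4-dim embeddings.
input_embedding = torch.randn(2, 10, 4)

# Positions of the appended (attackable) words for each example in the batch.
allow_idxs = [[7, 8, 9], [6, 7, 8]]

# Build a 0/1 mask with the same shape as the embeddings; only the allowed
# positions can receive a perturbation, everything else stays fixed.
cw_mask = np.zeros(input_embedding.shape, dtype=np.float32)
for bi, allow_idx in enumerate(allow_idxs):
    cw_mask[bi, np.array(allow_idx)] = 1
cw_mask = torch.from_numpy(cw_mask)

# A candidate perturbation is applied only where the mask is 1.
perturbation = torch.randn_like(input_embedding)
adv_embedding = input_embedding + cw_mask * perturbation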
def queue_segments(jobs, conf, connection):
    """Make a new Mesos job for every segment."""
    try:
        has_enough_segs = False
        for segment in determine_segments(**conf):
            completed_scene_list = []
            segment_length = len(segment)
            if segment_length >= conf.minscenesperseg:
                has_enough_segs = True
                logger.info("Segment length: %d", len(segment))
                logger.info("Segment: %s", segment)
                for scene_record in segment:
                    # Build list to be used in SQL insert statement.
                    row = (scene_record['LANDSAT_PRODUCT_ID'],
                           scene_record['FILE_LOC'])
                    completed_scene_list.append(row)
                    # Set 'BLANK' to 'INQUEUE' processing status.
                    db.set_scene_to_inqueue(connection,
                                            scene_record['LANDSAT_PRODUCT_ID'])

                logger.info("Scenes inserted into ARD_PROCESSED_SCENES table:"
                            " %s", completed_scene_list)
                db.processed_scenes(connection, completed_scene_list)

                # WARNING: This assumes subdirectories are desired.
                subdirdest = {
                    'LT04': 'tm',
                    'LT05': 'tm',
                    'LE07': 'etm',
                    'LC08': 'oli_tirs'
                }
                final_output = os.path.join(conf.outdir, "lta_incoming",
                                            subdirdest[segment[0]['SATELLITE']],
                                            'ARD_Tile')

                # Build the Docker entrypoint command.
                cmd = ' '.join([
                    'cli.py',
                    "'" + json.dumps(segment, sort_keys=True, default=str) + "'",
                    final_output
                ])
                job_id = format_job_id(segment)
                logger.debug('Command to clip: [%s]', cmd)

                # Compile the job information.
                job = Job()
                job.cpus = conf.cpus
                job.disk = conf.disk
                job.mem = conf.memory
                job.command = cmd
                job.job_id = job_id
                jobs.append(job)
                logger.info('Queuing job id: %s', job_id)

        if not has_enough_segs:
            logger.info("No segments meet the %d scenes per segment minimum",
                        conf.minscenesperseg)
        return SUCCESS

    except Exception:
        logger.exception('Unable to fetch segments!')
        return ERROR
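# The Job object used above is only a container for the resources and command of
# a single Mesos task; its definition is not part of this excerpt. A minimal
# stand-in (an assumption, not the project's actual class) could look like this:
class Job(object):
    """Illustrative container describing one Mesos task to be queued."""

    def __init__(self):
        self.cpus = 0      # CPU shares requested for the task
        self.disk = 0      # disk (MB) requested for the task
        self.mem = 0       # memory (MB) requested for the task
        self.command = ''  # Docker entrypoint command to run
        self.job_id = ''   # identifier used when launching the Mesos task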
def shutdown(self, signum, frame):
    """Signal handler: set the flag so the main loop can shut down gracefully."""
    self.flag = True
    logger.info("Shutdown requested.")
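# The (signum, frame) signature above matches what Python's signal module passes
# to a handler. A minimal sketch (an assumption about how the Shutdown object used
# by run_forever below is wired up) of registering it for SIGINT/SIGTERM:
import signal
import logging

logger = logging.getLogger(__name__)


class Shutdown(object):
    """Illustrative shutdown tracker: sets a flag when a signal arrives."""

    def __init__(self):
        self.flag = False
        # Route Ctrl-C and termination requests through the handler.
        signal.signal(signal.SIGINT, self.shutdown)
        signal.signal(signal.SIGTERM, self.shutdown)

    def shutdown(self, signum, frame):
        """Signal handler: request a graceful shutdown."""
        self.flag = True
        logger.info("Shutdown requested.")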
def cw_rand_words_attack(data_val):
    """Run the Carlini-Wagner attack on randomly inserted words."""
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    orig_append_correct = 0
    tot = 0
    adv_pickle = []
    cw = CarliniL2_random(debug=args.debugging)
    for batch in get_random_word_batch(data_val):
        data = batch['data']
        seq_len = batch['seq_len']
        label = batch['label']
        batch_add_start = batch['add_start']
        batch_add_end = batch['add_end']
        attack_targets = batch['attack_targets']
        add_sents = batch['add_sents']
        allow_idxs = batch['allow_idx']
        tot += len(label)

        input_embedding = model.bert.embeddings.word_embeddings(data)
        # Only the inserted (allowed) token positions may be perturbed.
        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        for bi, allow_idx in enumerate(allow_idxs):
            cw_mask[bi, np.array(allow_idx)] = 1
        cw_mask = torch.from_numpy(cw_mask).float().to(device)

        cw.wv = model.bert.embeddings.word_embeddings.weight
        cw.mask = cw_mask
        cw.seq = data
        cw.batch_info = batch
        cw.seq_len = seq_len
        adv_data = cw.run(model, input_embedding, attack_targets)

        # Write the best adversarial tokens back into the sequence and re-test.
        adv_seq = torch.tensor(batch['data']).to(device)
        for bi, allow_idx in enumerate(allow_idxs):
            if bi in cw.o_best_sent:
                for i, idx in enumerate(allow_idx):
                    adv_seq.data[bi, idx] = cw.o_best_sent[bi][i]

        out = model(adv_seq, seq_len)['pred']
        prediction = torch.max(out, 1)[1]
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == label).float()).item()
        targeted_success += torch.sum(
            (prediction == attack_targets).float()).item()
        untargeted_success += untargeted_success_rate(prediction, label)

        for i in range(len(adv_seq)):
            adv_pickle.append({
                'raw_text': transform(adv_seq[i]),
                'label': label[i].item()
            })
            try:
                # logger.info(("orig:", transform(add_sents[i][1:])))
                logger.info(("adv:", transform(cw.o_best_sent[i])))
            except Exception:
                continue

        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted success count:", targeted_success))
        logger.info(("untargeted success count:", untargeted_success))
        logger.info(("tot:", tot))

    joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted success rate:", targeted_success / tot))
    logger.info(("untargeted success rate:", untargeted_success / tot))
def start(self):
    logger.info('Watching repository {user}/{repo} for updates'.format(
        user=self.__user, repo=self.__repo))
    super().start()
def run_forever(conf):
    """Entrypoint to keep the framework running until terminated."""
    logger.info('******************Start************')
    logger.debug('DB connection: %s', conf.l2_db_con)
    logger.debug("Minimum Scenes Per Seg: %s", conf.minscenesperseg)
    logger.debug('Segment query: %s', conf.segment_query)
    global shutdown

    db.reset_records(db.connect(conf.l2_db_con))

    # Establish framework, executor, and authentication credentials.
    framework = mesos_pb2.FrameworkInfo()
    framework.user = conf.framework_user
    framework.name = "ARD Tile Framework"
    framework.principal = conf.mesos_principal
    framework.role = conf.mesos_role

    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = "default"
    executor.name = "ARD Tile executor"

    implicit_acks = 1
    scheduler = ArdTileScheduler(implicit_acks, executor, conf)

    if not conf.disable_creds:
        logger.info(" MESOS creds ENABLED")
        credential = mesos_pb2.Credential()
        credential.principal = conf.mesos_principal
        credential.secret = conf.mesos_secret
        driver = mesos.native.MesosSchedulerDriver(scheduler, framework,
                                                   conf.master, implicit_acks,
                                                   credential)
    else:
        logger.info(" MESOS creds disabled")
        driver = mesos.native.MesosSchedulerDriver(scheduler, framework,
                                                   conf.master, implicit_acks)

    shutdown = Shutdown()

    def run_driver_async():
        """Thread target for async communication with Mesos offers."""
        # driver.run() blocks, so run it in a separate thread.
        status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()
        sys.exit(status)

    framework_thread = Thread(target=run_driver_async, args=())
    framework_thread.start()

    while framework_thread.is_alive():
        # If a shutdown has been requested, suppress offers and wait for the
        # framework thread to complete.
        if shutdown.flag:
            logger.info("Shutdown requested....")
            driver.suppressOffers()
            while framework_thread.is_alive():
                logger.debug("Thread alive, sleep 5....")
                time.sleep(5)
            break

        # If the job queue is empty, get work.
        if (not scheduler.jobs and
                queue_segments(scheduler.jobs, conf,
                               db.connect(conf.l2_db_con)) == ERROR):
            driver.stop(True)
            sys.exit(1)

        # If there's no new work to be done or the max number of jobs are
        # already running, suppress offers and wait for some jobs to finish.
        if not scheduler.jobs or not scheduler.scheduling_allowed():
            logger.info("No jobs or scheduling not allowed....")
            driver.suppressOffers()
            while not scheduler.scheduling_allowed():
                logger.debug("Scheduling not allowed, sleep 20....")
                time.sleep(20)
            while not scheduler.jobs:
                if queue_segments(scheduler.jobs, conf,
                                  db.connect(conf.l2_db_con)) == ERROR:
                    driver.stop(True)
                    sys.exit(1)
                time.sleep(20)
            driver.reviveOffers()
def run():
    logger.info("using device: {}".format(config.DEVICE))
    train_data = process_raw_data()
    train_list, test_list = train_test_split(train_data,
                                             test_size=0.2,
                                             random_state=34)
    # Load the GPT-2 model.
    model, n_ctx = create_model(True)
    model.to(config.DEVICE)

    # Whether to train on multiple GPUs in parallel; the GPUs to use are
    # selected via config.DEVICE_NUM.
    multi_gpu = False
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        logger.info("Using more than one GPU to train...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = config.DEVICE_NUM
        model = DataParallel(
            model, device_ids=[int(i) for i in config.DEVICE_NUM.split(",")])
        multi_gpu = True

    # Log the number of model parameters.
    num_parameters = sum(
        [parameter.numel() for parameter in model.parameters()])
    logger.info("number of model parameters: {}".format(num_parameters))

    # Load the data.
    logger.info("loading training data")
    train_dataset = DialogueDataset(train_list, n_ctx)
    batch_num = len(train_dataset) // config.BATCH_SIZE
    test_dataset = DialogueDataset(test_list, n_ctx)
    test_batch_num = len(test_dataset) // config.BATCH_SIZE
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=collate_fn)
    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=1,
                                  collate_fn=collate_fn)

    # Total number of optimization steps (total_steps) across all epochs.
    total_steps = int(
        len(train_data_loader) * config.EPOCHS / config.BATCH_SIZE /
        config.GRADIENT_ACCUMULATION)
    logger.info('total training steps = {}'.format(total_steps))

    # Set up the optimizer and use a warmup schedule at the start of training.
    optimizer = AdamW(model.parameters(),
                      lr=config.LEARNING_RATE,
                      correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARM_STEPS,
        num_training_steps=total_steps)

    logger.info("start training...")
    best_loss = 100
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_fn(model, train_data_loader, optimizer, scheduler, epoch,
                 batch_num, multi_gpu)
        loss, accuracy = eval_fn(model, test_data_loader, test_batch_num,
                                 multi_gpu)
        # Save a checkpoint whenever either the eval loss or accuracy improves.
        if loss < best_loss or accuracy > best_accuracy:
            logger.info('saving model for epoch {}, best loss: {}'.format(
                epoch + 1, loss))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(config.MODEL_PATH)
            best_loss = loss
            best_accuracy = accuracy
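# For reference, get_linear_schedule_with_warmup (used above) ramps the learning
# rate linearly from 0 to the configured value over num_warmup_steps, then decays
# it linearly toward 0 at num_training_steps. A small self-contained sketch with
# toy values (not the project's config):
import torch
from transformers import get_linear_schedule_with_warmup

# Toy setup: one parameter, 10 warmup steps out of 100 total steps.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=1e-3)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=10,
                                            num_training_steps=100)

for step in range(100):
    optimizer.step()   # normally preceded by loss.backward()
    scheduler.step()   # adjust the learning rate after each optimizer step
    if step in (0, 9, 50, 99):
        # The LR climbs to 1e-3 by step 9, then decays linearly toward 0.
        print(step, scheduler.get_last_lr()[0])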
    parser.add_argument(action="store", dest='output_path', type=str,
                        metavar='PATH')
    parser.add_argument('-c', '--config', action="store", dest='config_file',
                        default='/ARD_Clip.conf', required=False, type=str,
                        metavar='PATH')

    return vars(parser.parse_args())


if __name__ == '__main__':
    args = parse_cli()
    conf = config.read_config(args['config_file'])
    setup_logger(level='debug' if conf.debug else 'info')

    logger.info('******************Start************')
    logger.info(' DB connection: %s', conf.connstr)
    logger.info("       Version: %s", conf.version)
    logger.info("         Debug: %s", conf.debug)
    logger.info('       segment: %s', args['segment'])
    logger.info('   output path: %s', args['output_path'])

    process_segments(args['segment'], args['output_path'], conf)

    logger.info('..................Normal End............')