def executeRequest(self, requestData):
        """Executes the POST request and retrieves the correspondent response content.
        Request headers are generated here
        :return: the response content
        """
        headers = {"Host": "www.facebook.com",
                   "Origin":"http://www.facebook.com",
                   "Referer":"https://www.facebook.com",
                   "accept-encoding": "gzip,deflate",
                   "accept-language": "en-US,en;q=0.8",
                   "cookie": self._cookie,
                   "pragma": "no-cache",
                   "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.122 Safari/537.36",
                   "content-type": "application/x-www-form-urlencoded",
                   "accept": "*/*",
                   "cache-control": "no-cache"}

        url = "https://www.facebook.com/ajax/mercury/thread_info.php"

        start = time.time()
        response = requests.post(url, data=requestData, headers=headers)
        end = time.time()
        logger.info("Retrieved in {0:.2f}s".format(end-start))

        #Remove the leading "for (;;);" guard prefix (9 characters) that Facebook
        #prepends to the JSON payload
        msgsData = response.text[9:]
        return msgsData
Example #2
def download(youtube_id):
    temp_dir = tempfile.mkdtemp()

    # Fake up a YouTube URL since youtube-dl expects one
    youtube_url = "http://www.youtube.com/watch?v={0}".format(youtube_id)

    video_filename_template = youtube_id + ".%(ext)s"
    video_path_template = os.path.join(temp_dir, video_filename_template)

    # Download a copy of the video from YouTube, but limit the resolution to
    # no more than "720p".  For details on the '--format' option, see
    # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
    command_args = ["python", "youtube-dl/youtube-dl.py", "--format",
                    "best[height<=720]", "-icw", "-o", video_path_template,
                    youtube_url]
    results = popen_results(command_args)
    logger.info(results)

    files = os.listdir(temp_dir)
    if not files:
        return
    assert len(files) == 1
    video_path = os.path.join(temp_dir, files[0])
    logger.info(video_path)

    return video_path
def parseConversation(convPath, out, authors):
    """ Parse all "relevant" messages and related attributes, for then saving them to a text file.

    Current message format result example:
    2012.06.17 15:27:42 SENDER_1 Message text from sender1

    authors should be a dict mapping the IDs as present in the msgData to the preferred
    aliases, if any. If a key is not present, the ID itself is used as the alias for all
    subsequent messages.
    """
    with open(convPath, encoding='utf-8') as data_file:
        actions = json.load(data_file)

    f = open(out, "w", encoding='utf-8')
    #TODO consider different types of messages, like calls or Stickers
    #Stickers leave an empty message, given that there is no textual content
    #log:phone-call, log:video-call
    messages = []
    for action in actions:
        if "log_message_type" in action:
            logger.info("Skipping message of type: " + action["log_message_type"])
            continue
        msg = parseMessage(action, authors)
        if msg:
            messages.append(msg)
    #FIXME it has happened that the number of lines exceeds the previously reported number of messages retrieved
    for msg in messages:
        f.write(msg + "\n")
    f.close()
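# A minimal usage sketch for parseConversation (the paths and Facebook IDs below are
# hypothetical, and logger is assumed to be configured as in the other examples):
authors = {"fbid:100000000000001": "SENDER_1",
           "fbid:100000000000002": "SENDER_2"}
parseConversation("conversation.json", "conversation.txt", authors)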
Example #4
def handle_nick(data, match, client, channels):
    """
    When a user changes their nick, tell everyone in all the channels they're in
    about it.

    '^NICK (?P<nick>.*)'

    :type data: str
    :type match: dict
    :type client: Client
    :type channels: list
    """

    newnick = match['nick']

    logger.info("Set new user's nick to '{newnick}'", newnick=newnick)

    client.send(Protocol.Nick.response(client.nick, newnick))

    announce = Protocol.Nick.announce(client, newnick)
    for cl in set(chain.from_iterable(chan.clients for chan in client.channels)):
        if cl is not client:
            cl.send(announce)

    client.nick = newnick
Example #5
def install_it(pkg, system='debian'):
    """
    Install a package only if it isn't already installed.
    The function returns False if it was impossible to install the package,
    True otherwise.

    Possible values of system are:
    *    debian
    *    archlinux

    """
    if system=='debian':
        cmd = 'sudo apt-get --assume-yes install '+pkg
    elif system=='archlinux':
        cmd = 'sudo pacman -S '+pkg
    else:
        error('Unsupported system: '+system)
        return False
    info('Installing the package '+pkg+' ...')
    if is_pack_installed(pkg):
        return True
    
    try:
        (st, out) =  subprocess.getstatusoutput(cmd)
    except OSError:
        error('Maybe apt is not installed in this system.')
        st = 32512 # This is the exit status for 'command not found'
    return st==0
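# Hypothetical usage (assumes the error() helper used above is available):
if not install_it('htop', system='debian'):
    error('Could not install htop')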
Example #6
def response_xml(xml):
    # Take the last line of the reply that mentions 'aspsms', split it on '>',
    # and turn the 'value</Key' fragments into (value, key) pairs.
    xml = [outer.split('</') for outer in
           [inner.split('>') for inner in xml.split('\n') if 'aspsms' in inner][-1]
           if '</' in outer]
    result = dict()
    for val, key in xml:
        if val:
            result[key] = val
            logger.info('aspsms %s: %s' %(key, val))
    return result
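# A minimal sketch of what response_xml() produces, assuming the aspsms gateway returns
# the whole <aspsms> document on a single line (the payload below is hypothetical, and
# logger is assumed to be configured as in the other examples):
sample = '<aspsms><ErrorCode>1</ErrorCode><ErrorDescription>StatusCode:1</ErrorDescription></aspsms>'
print(response_xml(sample))
# -> {'ErrorCode': '1', 'ErrorDescription': 'StatusCode:1'}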
    def connectionLost(self, reason):
        logger.info('Lost client "{nick}"', nick=self._client.nick)
        announce = Protocol.quit(self._client, "Connection lost")

        for channel in self._client.channels:
            channel.clients.remove(self._client)
            channel.send(announce)

        self.factory.clients.remove(self)
Example #8
def copy_legacy_content_to_new_location(youtube_id):
    """Copies the MP4 & PNG files from a legacy-format video in the S3 converted bucket to the new naming scheme."""
    for key in converted_bucket.list(prefix="{0}/".format(youtube_id)):
        legacy_match = re_legacy_video_key_name.match(key.name)
        assert legacy_match is not None
        assert legacy_match.group(1) == youtube_id
        dest_key = "{0}.mp4/{1}".format(youtube_id, legacy_match.group(2))
        logger.info("Copying {0} to {1}".format(key.name, dest_key))
        key.copy(converted_bucket.name, dest_key, preserve_acl=True)
Example #9
def mail(params):

    from mail import send_mail

    logger.info('cli sends mail')
    response = send_mail(params.to, params.message, subject=params.subject, subjecttag=params.subjecttag, subjectdate=params.subjectd, cc=params.cc, bcc=params.bcc, files=params.files, sender=params.sender, footer=params.footer)
    if response == True:
        parser.exit(status=0, message='success, mail sent\n')
    else:
        parser.exit(status=-1, message='fail: %s\n' %(response))
Example #10
def twitter(params):

    from twitter import send_tweet

    logger.info('cli sends tweet')
    response = send_tweet(params.message, mention=params.mention)
    if response == True:
        parser.exit(status=0, message='success, tweet sent\n')
    else:
        parser.exit(status=-1, message='fail: %s\n' %(response))
Example #11
def aspsms(params):

    from aspsms import send_aspsms

    logger.info('cli sends aspsms')
    response = send_aspsms(params.to, params.message, originator=params.origin, flashing=params.flashing, maxchars=params.maxchars)
    if response == True:
        parser.exit(status=0, message='success, aspsms sent\n')
    else:
        parser.exit(status=-1, message='fail: %s\n' %(response))
    def run(self):
        client = ConnectFactory().getConnect("redis",self.config)
        for msg in self.consumer:
            kafkamsg = self._decodemsg(msg)
            try:
                logger.info("message handling(%s)" % kafkamsg)
                jsondata = json.loads(kafkamsg['rawdata'])
                ObjectFactory.fromjson(jsondata["message"]).execute(client)

            except Exception:
                # jsondata may be unbound if json.loads failed, so log the raw message instead
                logger.error("message execute error(%s)" % kafkamsg)
Example #13
def update_download_available(youtube_id, available_formats):

    url = "http://www.khanacademy.org/api/v1/videos/%s/download_available" % youtube_id
    params = {
        'formats': ','.join(available_formats),
        'key': secrets.ka_download_available_secret,
    }

    response = urllib2.urlopen(url, data=urllib.urlencode(params))
    logger.info(response.read())

    return response.code == 200
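# A rough Python 3 sketch of the snippet above, which uses Python 2's urllib2/urllib.
# The function name and the secret_key parameter are hypothetical, and logger is assumed
# to be configured as in the original:
import urllib.parse
import urllib.request

def update_download_available_py3(youtube_id, available_formats, secret_key):
    url = ("http://www.khanacademy.org/api/v1/videos/%s/download_available"
           % youtube_id)
    data = urllib.parse.urlencode({
        'formats': ','.join(available_formats),
        'key': secret_key,
    }).encode('utf-8')
    response = urllib.request.urlopen(url, data=data)  # POST because data is given
    logger.info(response.read())
    return response.getcode() == 200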
Example #14
def send_mail(to, messagetext, subject=None, **args):

    cc = args.get('cc', [])
    bcc = args.get('bcc', [])
    recipients = list(chain(to, cc, bcc))
    sender = args.get('sender', getconf('email_sender'))
    if not sender: sender = getconf('email_sender')
    footer = args.get('footer', getconf('email_footer'))
    if not footer: footer = getconf('email_footer')
    subjecttag = args.get('subjecttag', getconf('email_defaulttag'))
    subjectdate = args.get('subjectdate', getconf('email_subject_date'))
    files = args.get('files', [])

    logger.info('~' * 23)
    logger.info('sending new mail using %s:\n%d recipients ~ %d cc, %d bcc, %d files' %(sender, len(recipients), len(cc), len(bcc), len(files)))

    message = make_header(to, sender, cc, subject, subjecttag, subjectdate)
    message.attach(make_mime_text(messagetext, footer))
    for f in files:
        message.attach(make_mime_file(f))

    session = dialup()

    if session is not None:
        try:
            session.sendmail(sender, recipients, message.as_string().encode('UTF-8'))
        except SMTPException as ex:
            logger.error('smtp error: %s' %(ex))
            return ex
        else:
            logger.info('mail sent')
            return True
        finally:
            ext_log(session.quit(), 'quit')

    logger.info('end mail')
Example #15
def send_aspsms(to, messagetext, **args):

    originator = args.get('originator', getconf('aspsms_originator'))
    if not originator: originator = getconf('aspsms_originator')
    flashing = args.get('flashing', getconf('aspsms_flashing'))
    maxchars = args.get('maxchars', getconf('aspsms_maxchars'))

    logger.info('~' * 23)
    logger.info('sending new aspsms using %s:\n%d recipients ~ flashing: %s' %(originator, len(to), flashing))

    message = wrap(messagetext, maxchars)

    try:
        for recipient in to:
            for text in message:
                payload = make_xml(recipient, originator, text, flashing)
                response = response_xml(post_xml(payload))
                if not response['ErrorCode'] == '1':
                    raise Exception('aspsms error: %s' %(response['ErrorDescription']))
    except Exception as ex:
        logger.error('error: %s' %(ex))
        return ex
    else:
        logger.info('aspsms sent')
        return True

    logger.info('end aspsms')
Example #16
def printDelayStatsFor(conv):
    delay = conv.stats.getDelayStats()
    logger.info("##Reply Delay Stats")
    logger.info("Reply delay by sender: ")
    for s, d in delay.items():
        msg = "Between {} and {}".format(s.split(':')[0], s.split(':')[1])
        logger.info('{} : {}'.format(msg, d))
    logger.info('-'*10)
Example #17
def list_converted_formats():
    """Returns a dict that maps youtube_ids (keys) to a set of available converted formats (values)"""
    converted_videos = defaultdict(set)
    legacy_video_keys = set()
    for key in converted_bucket.list(delimiter="/"):
        video_match = re_video_key_name.match(key.name)
        if video_match is None:
            if re_legacy_video_key_name.match(key.name) is not None:
                legacy_video_keys.add(key.name)
            else:
                logger.warning("Unrecognized key {0} is not in format YOUTUBE_ID.FORMAT/".format(key.name))
        else:
            converted_videos[video_match.group(1)].add(video_match.group(2))
    logger.info("{0} legacy converted videos were ignored".format(len(legacy_video_keys)))
    return converted_videos
Example #18
def start_converting(youtube_id, s3_url, formats_to_create):

    thumbnail_time = youtube.get_thumbnail_time(youtube_id)
    assert thumbnail_time

    zen = Zencoder(zencoder_api_key)
    outputs = []

    for format_to_create in formats_to_create:
        if format_to_create in output_types():
            outputs += [fxn(youtube_id, thumbnail_time) for fxn in output_types()[format_to_create]]
    
    job_response = zen.job.create(s3_url, outputs=outputs)

    assert job_response.code == 201, job_response.body

    logger.info("Zencoder job created successfully")
Example #19
def start_converting(youtube_id, s3_url, formats_to_create, base_url=BASE_URL):

    # TODO(csilvers): figure out how to get thumbnail times from youtube APIv3
    # thumbnail_time = youtube.get_thumbnail_time(youtube_id)
    thumbnail_time = None

    zen = Zencoder(zencoder_api_key)
    outputs = []

    for format_to_create in formats_to_create:
        assert format_to_create in output_types(), (format_to_create, output_types())
        outputs += [fxn(youtube_id, thumbnail_time, base_url) for fxn in output_types()[format_to_create]]

    job_response = zen.job.create(s3_url, outputs=outputs)

    assert job_response.code == 201, job_response.body

    logger.info("Zencoder job created successfully")
Example #20
def blog_article(year, month, day, title):
    created = datetime.datetime(year, month, day)
    para = {"created": created, "created_next_day": created + datetime.timedelta(days=1), "title": title}
    logger.info(
        "SELECT * FROM blog WHERE created >= %(created)s and created < %(created_next_day)s and title = %(title)s"
        % para
    )
    db.execute(
        "SELECT * FROM blog WHERE created >= %(created)s and created < %(created_next_day)s and title = %(title)s", para
    )
    r = db.fetchall()
    for i in r:
        i["source_file"] = os.path.join("blog", i["source_file"] + ".html")
    if r:
        para = {"data": r}
        return render_template("blog/article.html", **para)
    else:
        return render_template("404.html"), 404
Example #21
def main():
    logger.info("Adding hooks...")
    addhooks()
    logger.info("Done with hooks!")

    logger.info("Setting up factory")
    factory = IRCFactory()
    factory.protocol = ClientConnection

    logger.info("Listening")
    reactor.listenTCP(6667, factory)
    reactor.run()
Example #22
def download(youtube_id):
    temp_dir = tempfile.mkdtemp()

    # Fake up a YouTube URL since youtube-dl expects one
    youtube_url = "http://www.youtube.com/watch?v={0}".format(youtube_id)

    video_filename_template = youtube_id + ".%(ext)s"
    video_path_template = os.path.join(temp_dir, video_filename_template)

    command_args = ["python", "youtube-dl/youtube-dl.py", "--max-quality", "22", "-icw", "-o", video_path_template, youtube_url]
    results = popen_results(command_args)
    logger.info(results)

    files = os.listdir(temp_dir)
    assert len(files) == 1
    video_path = os.path.join(temp_dir, files[0])
    logger.info(video_path)

    return video_path
Example #23
def handle_user(data, match, client, channels):
    """
    This is a handshake. We'll blatantly disregard the whole pinging thing,
    because I don't really care. Clients still ping the server as it is.
    We'll see how it works out.

    '^USER (?P<user>[^\s]+) (?P<mode>[^\s]+) (?P<junk>[^\s]+) :(?P<name>.*)'

    :type data: str
    :type match: dict
    :type client: Client
    :type channels: list
    """
    client.name = match['name']
    client.host = match['host']
    logger.info("Set new user's name to '{name}'", name=client.name)
    logger.info("Set new user's host to '{host}'", host=client.host)

    response = Protocol.handshake(client)

    return [line.format(nick=client.nick) for line in response]
Example #24
def get_or_create_unconverted_source_url(youtube_id):
    matching_keys = list(unconverted_bucket.list(youtube_id))
    matching_key = None

    if len(matching_keys) > 0:
        if len(matching_keys) > 1:
            logger.warning("More than 1 matching unconverted video URL found for video {0}".format(youtube_id))
        matching_key = matching_keys[0]
    else:
        logger.info("Unconverted video not available on s3 yet, downloading from youtube to create it.")

        video_path = youtube.download(youtube_id)
        logger.info("Downloaded video to {0}".format(video_path))

        assert(video_path)

        video_extension = splitext(video_path)[1]
        assert video_extension[0] == "."
        video_extension = video_extension[1:]
        if video_extension not in ["flv", "mp4"]:
            logger.warning("Unrecognized video extension {0} when downloading video {1} from YouTube".format(video_extension, youtube_id))

        matching_key = Key(unconverted_bucket, "{0}/{0}.{1}".format(youtube_id, video_extension))
        matching_key.set_contents_from_filename(video_path)

        os.remove(video_path)
        logger.info("Deleted {0}".format(video_path))

    return "s3://{0}/{1}".format(unconverted_bucket.name, matching_key.name)
Example #25
def updateswitch(switch_ip, dms_location, redis_server):
    logger.info('start to update switch(%s)' % switch_ip)

    headers = {'content-type': 'application/json'}
    switch = {}
    switch['managementIp'] = switch_ip
    keys = redis_server.hkeys('locations')
    location_id = None
    for key in keys:
        value = redis_server.hget('locations', key)
        if value == dms_location:
            location_id = key
    if not location_id:
        print('cannot find the corresponding location id')
        logger.error("cannot find the location id")
        sys.exit(-1)
    switch['locationId'] = location_id
    switch_json = json.dumps(switch)
    logger.info('request:%s' % switch_json)
    r = requests.post('%s/switch' % dso_url, data=switch_json, headers=headers)
    if r.status_code == 201:
        print('notify dso to bind location and switch successfully.')
    else:
        print('fail to notify dso to bind location and switch:status_code(%s),content(%s)' % (r.status_code, r.content))
Example #26
def deleteport(switch_json):
    logger.info('start to delete port:(%s)' % switch_json)
    headers = {'content-type': 'application/json'}
    r = requests.delete("%s/switch" % dso_url, data=switch_json, headers=headers)
    logger.info(r.status_code)
    logger.info(r.content)
    if r.status_code == 200:
        print('notify dso to delete ports successfully.')
    else:
        print(r.content)
        sys.exit(-1)
Example #27
def createswitch(switch_json):
    logger.info('start to create switch:(%s)' % switch_json)
    headers = {'content-type': 'application/json'}
    r = requests.post("%s/switch" % dso_url, data=switch_json, headers=headers)
    logger.info(r.status_code)
    logger.info(r.content)
    if r.status_code == 201:
        print('notify dso to create switch successfully.')
    else:
        print(r.content)
        print('fail to notify dso to create switch.')
        sys.exit(-1)
Example #28
def repost_replies(account_name):
    bf = open('.blacklist_%s' % account_name, 'a+')
    bf.seek(0)  # 'a+' positions the stream at the end, so rewind before reading
    blacklist = bf.read().splitlines()
    bf.close()

    rp = open('.reposted_%s' % account_name, 'a+')
    rp.seek(0)
    reposted = rp.read().splitlines()

    account = settings.ACCOUNTS.get(account_name)

    try:
        logging.info('[%s] Getting last mentions offset'%account_name)
        bot = TwitterBot(settings.CONSUMER_KEY,settings.CONSUMER_SECRET,
                         account['key'],account['secret'])
        mentions = []
        try:
            mentions = bot.api.mentions()
            logging.info('[%s] Got %d mentions'%(account_name,len(mentions)))
        except Exception as e:
            logging.error('[%s] Failed to get mentions. %s'%(account_name,e))

        for mess in reversed(mentions):
            try:
                author = mess.author.screen_name
                if str(author) in blacklist:
                    logging.debug('[%s] Author %s blacklisted. Skipping.'%(account_name,str(author)))
                    continue
                if str(mess.id) in reposted:
                    logging.debug('[%s] Message #%s already reposted. Skipping.'%(account_name,str(mess.id)))
                    continue

                message = mess.text.split(' ')
                if message[0] != '@%s'%account_name:
                    continue #not a "@reply"

                trigger = message[1]
                triggers = dict(account['triggers'])
                if trigger not in triggers:
                    logging.warning('[%s] Bad message format, sending DM to author'%account_name)
                    bot.dm(author,account['not_triggered'])
                else:
                    len_params = {'message':'','user':author}
                    mess_len = len(triggers[trigger]%len_params)
                    params = {'message':bot.trim_message(' '.join(message[2:]),mess_len),'user':author}
                    message = triggers[trigger]%params
                    logging.info('[%s] Tweeting message %s'%(account_name,message))
                    bot.tweet(message)
                rp.write('%s\n'%mess.id)
            except Exception as e:
                logging.error('%s'%e)
                continue
Example #29
def get_or_create_unconverted_source_url(youtube_id):
    matching_keys = list(unconverted_bucket.list(youtube_id))

    # TODO(alpert): How do these .part files get created? They're not real
    # video files and should be ignored.
    matching_keys = [key for key in matching_keys
                     if not key.name.endswith('.part')]
                     
    matching_key = None

    if matching_keys:
        if len(matching_keys) > 1:
            logger.warning("More than 1 matching unconverted video "
                           "URL found for video {0}".format(youtube_id))
        matching_key = matching_keys[0]
    else:
        logger.info("Unconverted video not available on s3 yet, "
                    "downloading from youtube to create it.")

        video_path = youtube.download(youtube_id)
        if not video_path:
            logger.warning("Error downloading video {0}".format(youtube_id))
            return
        logger.info("Downloaded video to {0}".format(video_path))

        video_extension = splitext(video_path)[1]
        assert video_extension[0] == "."
        video_extension = video_extension[1:]
        if video_extension not in ["flv", "mp4"]:
            logger.warning("Unrecognized video extension {0} when downloading "
                           "video {1} from YouTube".format(
                               video_extension, youtube_id))

        matching_key = Key(unconverted_bucket, "{0}/{0}.{1}".format(
            youtube_id, video_extension))
        matching_key.set_contents_from_filename(video_path)

        os.remove(video_path)
        logger.info("Deleted {0}".format(video_path))

    return "s3://{0}/{1}".format(unconverted_bucket.name, matching_key.name)
    def change_baud(self, baud):
        self._close()
        logger.info("Change channel %s from %s to %s" %
                    (self.ser.port, self.ser.baudrate, int(baud)))
        self.ser.baudrate = int(baud)
        return self._open()
    def scrapeConversation(self, merge, offset, timestampOffset, chunkSize, limit, isGroupConversation):
        """Retrieves conversation messages and stores them in a JSON file
        If merge is specified, the new messages will be merged with the previous version of the conversation, if present
        """

        if merge:
            if not os.path.exists(self._directory + "conversation.json"):
                logger.error("Conversation not present. Merge operation not possible")
                return
            with open(self._directory + "conversation.json") as conv:
                convMessages = json.load(conv)
                numMergedMsgs = 0

        if not os.path.exists(self._directory):
            os.makedirs(self._directory)

        logger.info("Starting scraping of conversation {}".format(self._convID))

        messages = []
        msgsData = ""
        timestamp = "" if timestampOffset == 0 else str(timestampOffset)
        while self.CONVERSATION_ENDMARK not in msgsData:
            requestChunkSize = chunkSize if limit <= 0 else min(chunkSize, limit-len(messages))
            reqData = self.generateRequestData(offset, timestamp, requestChunkSize, isGroupConversation)
            logger.info("Retrieving messages " + str(offset) + "-" + str(requestChunkSize+offset))
            msgsData = self.executeRequest(reqData)
            jsonData = json.loads(msgsData)

            if jsonData and ('payload' in jsonData) and jsonData['payload']:
                if ('actions' in jsonData['payload']) and jsonData['payload']['actions']:
                    actions = jsonData['payload']['actions']

                    #case when the last message already present in the conversation
                    #is newer than the first one of the currently retrieved chunk
                    if merge and convMessages[-1]["timestamp"] > actions[0]["timestamp"]:
                        for i, action in enumerate(actions):
                            if convMessages[-1]["timestamp"] == actions[i]["timestamp"]:
                                numMergedMsgs = len(actions[i+1:-1]) + len(messages)
                                messages = convMessages + actions[i+1:-1] + messages
                                break
                        break

                    #We retrieve one message twice: as the first one of the previous chunk
                    #and as the last one of the new one. So here we remove the duplicate,
                    #but only once we have already retrieved at least one chunk
                    if len(messages) == 0:
                        messages = actions
                    else:
                        messages = actions[:-1] + messages

                    #update timestamp
                    timestamp = str(actions[0]["timestamp"])
                else:
                    if 'errorSummary' in jsonData:
                        logger.error("Response error: " + jsonData['errorSummary'])
                    else:
                        logger.error("Response error. No messages found")
                        logger.error(msgsData)
                    return
            else:
                logger.error("Response error. Empty data or payload")
                logger.error(msgsData)
                logger.info("Retrying in " + str(self.ERROR_WAIT) + " seconds")
                time.sleep(self.ERROR_WAIT)
                continue

            offset += chunkSize
            if limit != 0 and len(messages) >= limit:
                break

            time.sleep(self.REQUEST_WAIT)

        if merge:
            logger.info("Successfully merged {} new messages".format(numMergedMsgs))
            logger.info("Conversation total message count = {}".format(len(messages)))
        else:
            logger.info("Conversation scraped successfully. {} messages retrieved".format(len(messages)))

        self.writeMessages(messages)
Example #32
def cw_tree_attack():
    cw = CarliniL2_qa(debug=args.debugging)
    criterion = nn.CrossEntropyLoss()
    loss = 0
    tot = 0
    adv_loss = 0
    targeted_success = 0
    untargeted_success = 0
    adv_text = []
    answers = dict()
    adv_answers = dict()

    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    vocab = Vocab(filename=args.dictionary,
                  data=[PAD_WORD, UNK_WORD, EOS_WORD, SOS_WORD])
    generator = Generator(args.test_data, vocab=vocab, embed=embed)
    transfered_embedding = torch.load('bidaf_transfered_embedding.pth')
    transfer_emb = torch.nn.Embedding.from_pretrained(transfered_embedding).to(
        device)
    seqback = WrappedSeqback(embed,
                             device,
                             attack=True,
                             seqback_model=generator.seqback_model,
                             vocab=vocab,
                             transfer_emb=transfer_emb)
    treelstm = generator.tree_model
    generator.load_state_dict(torch.load(args.load_ae))

    backup_params = EMA(0)
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup_params.register(name, param.data)
            param.data.copy_(ema.get(name))

    class TreeModel(nn.Module):
        def __init__(self):
            super(TreeModel, self).__init__()
            self.inputs = None

        def forward(self, hidden):
            self.embedding = seqback(hidden)
            return model(batch, perturbed=self.embedding)

        def set_temp(self, temp):
            seqback.temp = temp

        def get_embedding(self):
            return self.embedding

        def get_seqback(self):
            return seqback

    tree_model = TreeModel()
    for batch in tqdm(iter(data.dev_iter), total=1000):
        p1, p2 = model(batch)
        orig_answer, orig_s_idx, orig_e_idx = write_to_ans(
            p1, p2, batch, answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()

        append_info = append_input(batch, vocab)
        batch_add_start = append_info['add_start']
        batch_add_end = append_info['add_end']
        batch_start_target = torch.LongTensor(
            append_info['target_start']).to(device)
        batch_end_target = torch.LongTensor(
            append_info['target_end']).to(device)
        add_sents = append_info['append_sent']

        input_embedding = model.word_emb(batch.c_word[0])
        append_info['tree'] = [generator.get_tree(append_info['tree'])]
        seqback.sentences = input_embedding.clone().detach()
        seqback.batch_trees = append_info['tree']
        seqback.batch_add_sent = append_info['ae_sent']
        seqback.start = append_info['add_start']
        seqback.end = append_info['add_end']
        seqback.adv_sent = []

        batch_tree_embedding = []
        for bi, append_sent in enumerate(append_info['ae_sent']):
            sentences = [
                torch.tensor(append_sent, dtype=torch.long, device=device)
            ]
            trees = [append_info['tree'][bi]]
            tree_embedding = treelstm(sentences, trees)[0][0].detach()
            batch_tree_embedding.append(tree_embedding)
        hidden = torch.cat(batch_tree_embedding, dim=0)
        cw.batch_info = append_info
        cw.num_classes = append_info['tot_length']
        cw.run(tree_model,
               hidden, (batch_start_target, batch_end_target),
               input_token=input_embedding)
        seqback.adv_sent = []

        # re-test
        for bi, (add_start,
                 add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            if bi in cw.o_best_sent:
                ae_words = cw.o_best_sent[bi]
                bidaf_tokens = bidaf_convert_to_idx(ae_words)
                batch.c_word[0].data[bi, add_start:add_end] = torch.LongTensor(
                    bidaf_tokens)
        p1, p2 = model(batch)
        adv_answer, adv_s_idx, adv_e_idx = write_to_ans(
            p1, p2, batch, adv_answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        adv_loss += batch_loss.item()

        for bi, (start_target, end_target) in enumerate(
                zip(batch_start_target, batch_end_target)):
            start_output = adv_s_idx
            end_output = adv_e_idx
            targeted_success += int(
                compare(start_output, start_target.item(), end_output,
                        end_target.item()))
            untargeted_success += int(
                compare_untargeted(start_output, start_target.item(),
                                   end_output, end_target.item()))

        for i in range(len(add_sents)):
            logger.info(("orig:", transform(add_sents[i])))
            try:
                logger.info(("adv:", cw.o_best_sent[i]))
                adv_text.append({
                    'adv_text': cw.o_best_sent[i],
                    'qas_id': batch.id[i],
                    'adv_predict': (orig_s_idx, orig_e_idx),
                    'orig_predict': (adv_s_idx, adv_e_idx),
                    'Orig answer:': orig_answer,
                    'Adv answer:': adv_answer
                })
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
            except:
                adv_text.append({
                    'adv_text': transform(add_sents[i]),
                    'qas_id': batch.id[i],
                    'adv_predict': (orig_s_idx, orig_e_idx),
                    'orig_predict': (adv_s_idx, adv_e_idx),
                    'Orig answer:': orig_answer,
                    'Adv answer:': adv_answer
                })
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
                continue
        # for batch size = 1
        tot += 1
        logger.info(("orig predict", (orig_s_idx, orig_e_idx)))
        logger.info(("adv append predict", (adv_s_idx, adv_e_idx)))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargetd successful rate:", untargeted_success))
        logger.info(("Orig answer:", orig_answer))
        logger.info(("Adv answer:", adv_answer))
        logger.info(("tot:", tot))

    for name, param in model.named_parameters():
        if param.requires_grad:
            param.data.copy_(backup_params.get(name))

    with open(options.prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers), file=f)
    with open(options.prediction_file + '_adv.json', 'w',
              encoding='utf-8') as f:
        print(json.dumps(adv_answers), file=f)
    results = evaluate.main(options)
    logger.info(tot)
    logger.info(("adv loss, results['exact_match'], results['f1']", loss,
                 results['exact_match'], results['f1']))
    return loss, results['exact_match'], results['f1']
Example #33
        except:
            # result[data['qas_id']] = [' '.join(transform(data['adv_text']))]
            result[data['qas_id']] = None
    return result


if __name__ == '__main__':
    options = args
    device = torch.device("cuda:{}".format(options.gpu))
    best_model_file_name = "model.bin"
    best_ema = "ema.pth"

    # ===-----------------------------------------------------------------------===
    # Log some stuff about this run
    # ===-----------------------------------------------------------------------===
    logger.info(' '.join(sys.argv))
    logger.info('')
    logger.info(options)

    logger.info('loading SQuAD data...')
    data = SQuAD(options)
    setattr(options, 'char_vocab_size', len(data.CHAR.vocab))
    setattr(options, 'word_vocab_size', len(data.WORD.vocab))
    if options.test_file is not None:
        print("testing")
        setattr(options, 'dataset_file',
                '.data/squad/{}'.format(options.test_file))
    else:
        setattr(options, 'dataset_file',
                '.data/squad/{}'.format(options.dev_file))
    setattr(options, 'prediction_file',
Example #34
def check_backups_completeness(full_backup_info, increment_backup_info):
    """校验备份完整性"""
    if len(increment_backup_info) == 0:
        logger.info('进行全量恢复')
        logger.info('备份完整')
        return True
    elif len(increment_backup_info) > 0:
        logger.info('进行全量恢复 + 增量恢复')
        checkpoint_lsn = full_backup_info[0]['to_lsn']
        for ibi in increment_backup_info:
            if checkpoint_lsn != ibi['from_lsn']:
                logger.info('备份不完整')
                logger.info('id为' + ibi['id'] + '的增量备份信息出错')
                sys.exit(1)
            else:
                checkpoint_lsn = ibi['to_lsn']
        logger.info('备份完整')
        return True
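# A minimal sketch of the metadata shape this check expects (all values are hypothetical):
# each incremental backup must start at the to_lsn of the backup that precedes it.
full_backup_info = [{'id': 'full-0', 'to_lsn': '1000'}]
increment_backup_info = [
    {'id': 'inc-1', 'from_lsn': '1000', 'to_lsn': '1500'},
    {'id': 'inc-2', 'from_lsn': '1500', 'to_lsn': '2100'},
]
check_backups_completeness(full_backup_info, increment_backup_info)  # -> True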
Example #35
def exec_start_mysqld_safe(mysql_config_path):
    """使用mysqld_safe启动mysql服务"""
    logger.info('尝试使用mysqld_safe启动mysql')
    cmd = ['mysqld_safe', '--defaults-file=' + mysql_config_path]
    util.exec.exec_cmd(cmd, exec_start_mysqld_safe.__doc__, backgroud=True)
    def update_ppo_parameters(self, paths):
        '''
            @brief: update the ppo
        '''
        # step 1: get the data dict
        ob_normalizer_info = paths.pop()
        feed_dict = self.prepared_network_feeddict(paths)

        # step 2: train the network
        logger.info('| %11s | %11s | %11s | %11s| %11s|' %
                    ('surr', 'kl', 'ent', 'vf_loss', 'weight_l2'))
        self.timesteps_so_far += self.args.timesteps_per_batch
        for i_epochs in range(self.args.optim_epochs +
                              self.args.extra_vf_optim_epochs):

            minibatch_id_candidate = list(range(
                feed_dict[self.action_placeholder].shape[0]))
            self._npr.shuffle(minibatch_id_candidate)
            # make sure that only timesteps per batch is used
            minibatch_id_candidate = \
                minibatch_id_candidate[: self.args.timesteps_per_batch]
            current_id = 0

            surrogate_epoch, kl_epoch, entropy_epoch, vf_epoch, weight_epoch = \
                [], [], [], [], []
            while current_id + self.args.optim_batch_size <= \
                    len(minibatch_id_candidate) and current_id >= 0:

                # fetch the minidata batch
                sub_feed_dict, current_id, minibatch_id_candidate = \
                    self.construct_minibatchFeeddict_from_feeddict(
                        feed_dict, minibatch_id_candidate, current_id,
                        self.args.optim_batch_size,
                        is_all_feed=self.args.minibatch_all_feed
                    )

                if i_epochs < self.args.optim_epochs:
                    # train for one iteration in this epoch
                    _, i_surrogate_mini, i_kl_mini, i_entropy_mini, \
                        i_weight_mini = self.session.run(
                            [self.update_op, self.surr, self.kl, self.ent,
                                self.weight_decay_loss],
                            feed_dict=sub_feed_dict
                        )

                    # train the value network with fixed network coeff
                    _, i_vf_mini = self.session.run(
                        [self.update_vf_op, self.vf_loss],
                        feed_dict=sub_feed_dict)
                    surrogate_epoch.append(i_surrogate_mini)
                    kl_epoch.append(i_kl_mini)
                    entropy_epoch.append(i_entropy_mini)
                    vf_epoch.append(i_vf_mini)
                    weight_epoch.append(i_weight_mini)
                else:
                    # only train the value function, might be unstable if share
                    # the value network and policy network
                    _, i_vf_mini = self.session.run(
                        [self.update_vf_op, self.vf_loss],
                        feed_dict=sub_feed_dict)
                    vf_epoch.append(i_vf_mini)

            if i_epochs < self.args.optim_epochs:
                surrogate_epoch = np.mean(surrogate_epoch)
                kl_epoch = np.mean(kl_epoch)
                entropy_epoch = np.mean(entropy_epoch)
                vf_epoch = np.mean(vf_epoch)
                weight_epoch = np.mean(weight_epoch)
            else:
                surrogate_epoch = -0.1
                kl_epoch = -0.1
                entropy_epoch = -0.1
                weight_epoch = -0.1
                vf_epoch = np.mean(vf_epoch)

            # if we use kl_penalty, we will do early stopping if needed
            if self.args.use_kl_penalty:
                assert self.args.minibatch_all_feed, logger.error(
                    'KL penalty not available for epoch minibatch training')
                if kl_epoch > 4 * self.args.target_kl and \
                        self.args.minibatch_all_feed:
                    logger.info('Early Stopping')
                    break

            logger.info('| %10.8f | %10.8f | %10.4f | %10.4f | %10.4f |' %
                        (surrogate_epoch, kl_epoch, entropy_epoch, vf_epoch,
                         weight_epoch))

        i_surrogate_total, i_kl_total, i_entropy_total, \
            i_vf_total, i_weight_total = self.session.run(
                [self.surr, self.kl, self.ent,
                    self.vf_loss, self.weight_decay_loss],
                feed_dict=feed_dict
            )

        # step 3: update the hyperparameters of updating
        self.update_adaptive_hyperparams(kl_epoch, i_kl_total)

        # step 4: record the stats
        stats = {}

        episoderewards = np.array([path["rewards"].sum() for path in paths])
        stats["avg_reward"] = episoderewards.mean()
        stats["entropy"] = i_entropy_total
        stats["kl"] = i_kl_total
        stats["surr_loss"] = i_surrogate_total
        stats["vf_loss"] = i_vf_total
        stats["weight_l2_loss"] = i_weight_total
        stats['learning_rate'] = self.current_lr

        if self.args.use_kl_penalty:
            stats['kl_lambda'] = self.current_kl_lambda

        # step 5: record the summary and save checkpoints
        self.record_summary_and_ckpt(paths, stats, ob_normalizer_info)

        return stats
Example #37
def run():
    logger.info('using device:{}'.format(config.DEVICE))

    # dialogue model
    dialogue_model, _ = create_model(pre_trained=True)
    dialogue_model.to(config.DEVICE)
    dialogue_model.eval()

    # mutual-information (MMI) model
    mmi_model, _ = create_model(pre_trained=True, mmi=True)
    mmi_model.to(config.DEVICE)
    mmi_model.eval()

    if not os.path.exists(config.SAVE_SAMPLES_PATH):
        os.makedirs(config.SAVE_SAMPLES_PATH)
    samples_file = open(config.SAVE_SAMPLES_PATH + "/mmi_samples.txt",
                        "a",
                        encoding="utf8")
    samples_file.write("聊天记录: {}\n".format(datetime.now()))
    # 存储聊天记录,每个utterance以token的id的形式进行存储
    history = []
    print('开始和 chatbot 聊天,输入CTRL + Z以退出')

    while True:
        try:
            text = input("user: "******"user: {}\n".format(text))

            history.append(config.TOKENIZER.encode(text))
            input_ids = [config.TOKENIZER.cls_token_id]

            for history_id, history_utter in enumerate(
                    history[-config.MAX_HISTORY_LEN:]):
                input_ids.extend(history_utter)
                input_ids.append(config.TOKENIZER.sep_token_id)

            # used to generate responses in batch; shape is (batch_size, token_len)
            input_ids = [
                copy.deepcopy(input_ids) for _ in range(config.BATCH_SIZE)
            ]
            curr_input_tensors = torch.tensor(input_ids).long().to(
                config.DEVICE)

            # 2-D list with shape (max generated response length, batch_size);
            # generated[i][j] is the id of the i-th token of the j-th response
            generated = []

            # tracks whether every response has finished generating; once the i-th
            # response produces sep_token_id, i is added to finish_set
            finish_set = set()
            # generate at most max_len tokens
            for _ in range(config.MAX_LEN):
                outputs = dialogue_model(input_ids=curr_input_tensors)
                next_token_logits = outputs[0][:, -1, :]
                # apply a repetition penalty to every token already present in generated, lowering its probability
                for index in range(config.BATCH_SIZE):
                    for token_id in set(
                        [token_ids[index] for token_ids in generated]):
                        next_token_logits[index][
                            token_id] /= config.REPETITION_PENALTY
                next_token_logits = next_token_logits / config.TEMPERATURE

                # set the logit of [UNK] to -inf so the model can never predict the [UNK] token
                for next_token_logit in next_token_logits:
                    next_token_logit[config.TOKENIZER.convert_tokens_to_ids(
                        "[UNK]")] = -float("Inf")
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=config.TOP_K,
                                                        top_p=config.TOP_P)
                # torch.multinomial draws num_samples elements from the candidate set without replacement;
                # higher weights are more likely to be drawn, and the element indices are returned
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                # check whether any response has generated [SEP] and mark those that have finished
                for index, token_id in enumerate(next_token[:, 0]):
                    if token_id == config.TOKENIZER.sep_token_id:
                        finish_set.add(index)
                # check whether all responses have generated [SEP]
                finish_flag = True  # True if every response has produced the [SEP] token
                for index in range(config.BATCH_SIZE):
                    if index not in finish_set:  # batch generation not finished yet
                        finish_flag = False
                        break
                if finish_flag:
                    break
                generated.append([token.item() for token in next_token[:, 0]])
                # concatenate the newly generated tokens with the current input
                curr_input_tensors = torch.cat(
                    (curr_input_tensors, next_token), dim=-1)
            candidate_responses = []  # all generated candidate responses
            for batch_index in range(config.BATCH_SIZE):
                response = []
                for token_index in range(len(generated)):
                    if generated[token_index][
                            batch_index] != config.TOKENIZER.sep_token_id:
                        response.append(generated[token_index][batch_index])
                    else:
                        break
                candidate_responses.append(response)

            # input to the MMI model
            if config.DEBUG:
                print("candidate response:")
            samples_file.write("candidate response:\n")

            min_loss = float("Inf")
            best_response = ""
            for response in candidate_responses:
                mmi_input_id = [config.TOKENIZER.cls_token_id]  # every input starts with [CLS]
                mmi_input_id.extend(response)
                mmi_input_id.append(config.TOKENIZER.sep_token_id)
                for history_utter in reversed(
                        history[-config.MAX_HISTORY_LEN:]):
                    mmi_input_id.extend(history_utter)
                    mmi_input_id.append(config.TOKENIZER.sep_token_id)
                mmi_input_tensor = torch.tensor(mmi_input_id).long().to(
                    config.DEVICE)
                out = mmi_model(input_ids=mmi_input_tensor,
                                labels=mmi_input_tensor)

                loss = out[0].item()
                if config.DEBUG:
                    text = config.TOKENIZER.convert_ids_to_tokens(response)
                    print("{} loss:{}".format("".join(text), loss))
                samples_file.write("{} loss:{}\n".format("".join(text), loss))

                if loss < min_loss:
                    best_response = response
                    min_loss = loss
            history.append(best_response)
            text = config.TOKENIZER.convert_ids_to_tokens(best_response)
            print("chatbot:" + "".join(text))

            if config.SAVE_SAMPLES_PATH:
                samples_file.write("chatbot:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if config.SAVE_SAMPLES_PATH:
                samples_file.close()
            break
Example #38
    def wrapped(self):
        logger.info("Accessing {}".format(method.__name__))
        return method(self)
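# A minimal sketch of the decorator this wrapper fragment presumably belongs to
# (the name log_access is hypothetical; logger is assumed to be configured):
import functools

def log_access(method):
    @functools.wraps(method)
    def wrapped(self):
        logger.info("Accessing {}".format(method.__name__))
        return method(self)
    return wrapped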
Example #39
def printIntervalStatsFor(start, end, interval, days):
    logger.info("##Conv Interval")
    logger.info("Conversation started: {}".format(str(start)))
    logger.info("Conversation ended: {}".format(str(end)))
    logger.info("Conversation overall duration: {}".format(interval))

    logger.info("{:.0f} days without messages".format(len(days)))
    percentage = (len(days) / (interval.days + 1)) * 100
    logger.info(
        "{0:.2f}% out of the conversation overall days-interval".format(
            percentage))
    #logger.info(days)

    logger.info('-' * 10)
Example #40
def printLexicalStats(lexicalStatsDf):
    logger.info("##LEXICAL STATS")

    for sender, vals in lexicalStatsDf.iterrows():
        tokensCount, vocabularyCount, lexicalRichness = vals.tolist()
        logger.info("#" + sender)
        logger.info("Tokens count: {:.0f}".format(tokensCount))
        logger.info("Distinct tokens count: {:.0f}".format(vocabularyCount))
        logger.info("Lexical diversity: {0:.5f}".format(lexicalRichness))

    logger.info('-' * 10)
Example #41
def printBasicLengthStats(basicLengthStatsDf):
    logger.info("##BASIC LENGTH STATS")

    for sender, vals in basicLengthStatsDf.iterrows():
        totalNum, totalLength, avgLength = vals.tolist()
        logger.info("#" + sender)
        logger.info("Number of messages: {:.0f}".format(totalNum))
        logger.info("Total length: {:.0f}".format(totalLength))
        logger.info("Average length: {0:.2f}".format(avgLength))

    logger.info('-' * 10)
Example #42
def printEmoticonsStats(emoticonsStatsDf):
    logger.info("##EMOTICONS STATS")

    for sender, vals in emoticonsStatsDf.iterrows():
        numEmoticons, emoticonsRatio, lenMsgs = vals.tolist()
        logger.info("#" + sender)
        logger.info("Emoticons count: {:.0f}".format(numEmoticons))
        logger.info("Messages total length: {:.0f}".format(lenMsgs))
        logger.info("Ratio: {0:.5f}".format(emoticonsRatio))

    logger.info('-' * 10)
Example #43
    def registered(self, driver, frameworkId, masterInfo):
        """Upon successful initial registration to the Mesos cluster."""
        logger.info("Registered with framework ID %s", frameworkId.value)
Example #44
def cw_tree_attack(data_val, tree_data):
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    tot = 0
    orig_append_correct = 0
    adv_pickle = []

    cw = CarliniL2(debug=args.debugging)
    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    vocab = Vocab(filename=args.dictionary,
                  data=[PAD_WORD, UNK_WORD, EOS_WORD, SOS_WORD])
    generator = Generator(args.test_data,
                          vocab=vocab,
                          embed=embed,
                          data_set=data_val)
    bert_transfered_embedding = torch.load('bert_transfered_embedding.pth')
    transfer_emb = torch.nn.Embedding(
        bert_transfered_embedding.size(0),
        bert_transfered_embedding.size(1)).to(device)
    # transfer_emb = torch.nn.Embedding.from_pretrained(bert_transfered_embedding).to(device)
    transfer_emb.weight.data.copy_(bert_transfered_embedding)
    seqback = WrappedSeqback(embed,
                             device,
                             attack=True,
                             seqback_model=generator.seqback_model,
                             vocab=vocab,
                             transfer_emb=transfer_emb)
    treelstm = generator.tree_model
    generator.load_state_dict(torch.load(args.load_ae))

    class TreeModel(nn.Module):
        def __init__(self):
            super(TreeModel, self).__init__()

        def forward(self, hidden):
            self.embedding = seqback(hidden)
            return model(batch['data'],
                         batch['seq_len'],
                         perturbed=self.embedding)['pred']

        def set_temp(self, temp):
            seqback.temp = temp

        def get_embedding(self):
            return self.embedding

        def get_seqback(self):
            return seqback

    tree_model = TreeModel()
    for batch in get_tree_batch(data_val, tree_data, vocab):
        input_embedding = model.bert.embeddings.word_embeddings(batch['data'])
        batch['tree'] = [generator.get_tree(tree) for tree in batch['tree']]
        seqback.sentences = input_embedding.clone().detach()
        seqback.batch_trees = batch['tree']
        seqback.batch_add_sent = batch['ae_add_sents']
        seqback.start = batch['add_start']
        seqback.end = batch['add_end']
        seqback.adv_sent = []

        batch_tree_embedding = []
        for bi, append_sent in enumerate(batch['ae_add_sents']):
            sentences = [
                torch.tensor(append_sent, dtype=torch.long, device=device)
            ]
            trees = [batch['tree'][bi]]
            tree_embedding = treelstm(sentences, trees)[0][0].detach()
            batch_tree_embedding.append(tree_embedding)

        hidden = torch.cat(batch_tree_embedding, dim=0)
        cw.batch_info = batch

        adv_hidden = cw.run(tree_model,
                            hidden,
                            batch['attack_targets'],
                            batch_size=hidden.shape[0],
                            input_token=input_embedding)
        seqback.adv_sent = []

        adv_seq = torch.tensor(batch['data']).to(device)
        for bi, (add_start, add_end) in enumerate(
                zip(batch['add_start'], batch['add_end'])):
            if bi in cw.o_best_sent:
                ae_words = cw.o_best_sent[bi]
                bert_tokens = tokenizer.convert_tokens_to_ids(ae_words)
                adv_seq[bi, add_start:add_end] = torch.LongTensor(bert_tokens)

        out = model(adv_seq, batch['seq_len'])['pred']
        prediction = torch.max(out, 1)[1]
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == batch['label']).float()).item()
        targeted_success += torch.sum(
            (prediction == batch['attack_targets']).float()).item()
        untargeted_success += untargeted_success_rate(prediction,
                                                      batch['label'])
        tot += len(batch['label'])

        for i in range(len(batch['label'])):
            adv_pickle.append({
                'raw_text': transform(adv_seq[i]),
                'label': batch['label'][i].item()
            })
            try:
                logger.info(("orig:", transform(batch['add_sents'][i])))
                logger.info(("adv:", cw.o_best_sent[i]))
            except:
                continue

        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargetd successful rate:", untargeted_success))
        logger.info(("tot:", tot))
        joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted successful rate:", targeted_success / tot))
    logger.info(("untargetd successful rate:", untargeted_success / tot))
Example #45
    def scrapeConversation(self, merge, offset, timestampOffset, chunkSize,
                           limit, isGroupConversation):
        """Retrieves conversation messages and stores them in a JSON file
        If merge is specified, the new messages will be merged with the previous version of the conversation, if present
        """

        if merge:
            if not os.path.exists(
                    os.path.join(self._directory, "conversation.json")):
                logger.error(
                    "Conversation not present. Merge operation not possible")
                return
            with open(os.path.join(self._directory,
                                   "conversation.json")) as conv:
                convMessages = json.load(conv)
                numMergedMsgs = 0

        if not os.path.exists(self._directory):
            os.makedirs(self._directory)

        logger.info("Starting scraping of conversation {}".format(
            self._convID))

        messages = []
        msgsData = ""
        timestamp = "" if timestampOffset == 0 else str(timestampOffset)
        while self.CONVERSATION_ENDMARK not in msgsData:
            requestChunkSize = chunkSize if limit <= 0 else min(
                chunkSize, limit - len(messages))
            reqData = self.generateRequestData(offset, timestamp,
                                               requestChunkSize,
                                               isGroupConversation)
            logger.info("Retrieving messages {}-{}".format(
                offset, requestChunkSize + offset))
            msgsData = self.executeRequest(reqData)
            jsonData = json.loads(msgsData)

            if jsonData and ('payload' in jsonData) and jsonData['payload']:
                if ('actions' in jsonData['payload']
                    ) and jsonData['payload']['actions']:
                    actions = jsonData['payload']['actions']

                    #case when the last message already present in the conversation
                    #is newer than the first (i.e. oldest) one of the currently retrieved chunk
                    if merge and convMessages[-1]["timestamp"] > actions[0][
                            "timestamp"]:
                        for i, action in enumerate(actions):
                            if convMessages[-1]["timestamp"] == actions[i][
                                    "timestamp"]:
                                numMergedMsgs = len(
                                    actions[i + 1:-1]) + len(messages)
                                messages = convMessages + actions[
                                    i + 1:-1] + messages
                                break
                        break

                    #The same message is retrieved twice: as the first one of the previous chunk
                    #and as the last one of the new chunk. Remove the duplicate here,
                    #but only once at least one chunk has already been retrieved
                    if len(messages) == 0:
                        messages = actions
                    else:
                        messages = actions[:-1] + messages

                    #update timestamp
                    timestamp = str(actions[0]["timestamp"])
                else:
                    if 'errorSummary' in jsonData:
                        logger.error("Response error: " +
                                     jsonData['errorSummary'])
                    else:
                        logger.error("Response error. No messages found")
                        logger.error(msgsData)
                    return
            else:
                logger.error("Response error. Empty data or payload")
                logger.error(msgsData)
                logger.info("Retrying in {} seconds".format(self.ERROR_WAIT))
                time.sleep(self.ERROR_WAIT)
                continue

            offset += chunkSize
            if limit != 0 and len(messages) >= limit:
                break

            time.sleep(self.REQUEST_WAIT)

        if merge:
            logger.info(
                "Successfully merged {} new messages".format(numMergedMsgs))
            logger.info("Conversation total message count = {}".format(
                len(messages)))
        else:
            logger.info(
                "Conversation scraped successfully. {} messages retrieved".
                format(len(messages)))

        self.writeMessages(messages)
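
A minimal usage sketch of the method above; the class name, constructor arguments, and cookie handling are assumptions, since only scrapeConversation itself is shown here:

my_cookie = "c_user=...; xs=..."            # session cookie copied from the browser (placeholder)
scraper = ConversationScraper(              # hypothetical class and constructor
    convID="1234567890",
    cookie=my_cookie,
    directory="conversations/1234567890")
# Scrape the full history from the newest message backwards, 2000 messages per
# request, without merging into a previously saved dump.
scraper.scrapeConversation(merge=False, offset=0, timestampOffset=0,
                           chunkSize=2000, limit=0, isGroupConversation=False)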
Exemple #46
0
    def print_config(self):
        logger.info("=============== global config ===============")
        logger.info("queue length : " + str(self.queue_len))
        logger.info("population size : " + str(self.popsize))
        logger.info("mutate probability : " + str(self.prob_mutate))
        logger.info("the number of process (for multiprocessing) : " +
                    str(self.num_processor))
        logger.info("coverage differential threshold : " +
                    str(self.coverage_threshold))
        logger.info("enable transformation based on filter : " +
                    str(self.enable_filters))
        logger.info("enable optimize : " + str(self.enable_optimize))
        logger.info("robust_threshold : " + str(self.robust_threshold))

        logger.info("=============== translation config ===============")
        logger.info("rotation range : " + str(self.rotation_range))
        logger.info("translate range : " + str(self.translate_range))
        logger.info("shear range : " + str(self.shear_range))
        if self.enable_filters:
            logger.info("zoom range : " + str(self.zoom_range))
            logger.info("blur range : " + str(self.blur_range))
            logger.info("brightness range : " + str(self.brightness_range))
            logger.info("contrast range : " + str(self.contrast_range))
        logger.info("mutate step (for genetic algorithm) : " +
                    str(self.translation_step))

        logger.info("=============== Training Start ===============")
    def run(self):
        '''
            @brief:
                standard entry point invoked by "multiprocessing.Process"

            @NOTE:
                see parallel_util.py for the signal definitions
        '''
        self.build_models()

        # load the model if needed
        if self.args.ckpt_name is not None:
            self.restore_all()

        # the main training process
        while True:
            next_task = self.task_q.get()

            # Kill the learner
            if next_task is None or next_task == parallel_util.END_SIGNAL:
                self.task_q.task_done()
                break

            # Get the policy network weights
            elif next_task == parallel_util.START_SIGNAL:
                # just get the params of the network, no learning process
                self.task_q.task_done()
                self.result_q.put(self.get_policy())

            # Updating the network
            else:
                if self.args.test:
                    paths = next_task

                    paths.pop()
                    episoderewards = np.array(
                        [path["rewards"].sum() for path in paths])
                    self.task_q.task_done()
                    stats = {"avg_reward": episoderewards.mean()}
                    logger.info(stats)
                    return_data = {
                        'policy_weights': self.get_policy(),
                        'stats': stats,
                        'totalsteps': self.args.max_timesteps + 100,
                        'iteration': self.get_iteration_count(),
                        'std_reward': episoderewards.std(),
                        "avg_reward": episoderewards.mean(),
                        "max_reward": np.amax(episoderewards),
                        "min_reward": np.amin(episoderewards),
                        "median_reward": np.median(episoderewards),
                    }
                    self.result_q.put(return_data)

                # the actual training step
                else:
                    paths = next_task
                    stats = self.update_parameters(paths)
                    self.task_q.task_done()
                    return_data = {
                        'policy_weights': self.get_policy(),
                        'stats': stats,
                        'totalsteps': self.timesteps_so_far,
                        'iteration': self.get_iteration_count()
                    }
                    self.result_q.put(return_data)
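
For context, a hedged sketch of how a parent process might drive such a worker through the task and result queues; only the START/END signal protocol follows from the code above, while the worker class name and the argument namespace are assumptions:

import argparse
import multiprocessing

import parallel_util  # signal constants referenced by the worker above

# Hypothetical argument namespace; the real worker expects many more fields.
args = argparse.Namespace(ckpt_name=None, test=False, max_timesteps=int(1e6))

task_q = multiprocessing.JoinableQueue()
result_q = multiprocessing.Queue()

learner = Learner(args, task_q, result_q)   # hypothetical Process subclass exposing run()
learner.start()

task_q.put(parallel_util.START_SIGNAL)      # ask only for the current policy weights
task_q.join()
initial_weights = result_q.get()

task_q.put(parallel_util.END_SIGNAL)        # kill the learner
task_q.join()
learner.join()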
Exemple #48
0
    def model_fn(features, labels, mode, params):
        #### Training or Evaluation
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        total_loss, per_example_loss, logits = function_builder.get_race_loss(
            FLAGS, features, is_training)

        #### Check model parameters
        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        logger.info('#params: {}'.format(num_params))

        #### load pretrained models
        scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

        #### Evaluation mode
        if mode == tf.estimator.ModeKeys.EVAL:
            assert FLAGS.num_hosts == 1

            def metric_fn(per_example_loss, label_ids, logits, is_real_example):
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                eval_input_dict = {
                    'labels': label_ids,
                    'predictions': predictions,
                    'weights': is_real_example
                }
                accuracy = tf.metrics.accuracy(**eval_input_dict)

                loss = tf.metrics.mean(values=per_example_loss,
                                       weights=is_real_example)
                return {
                    'eval_accuracy': accuracy,
                    'eval_loss': loss}

            is_real_example = tf.cast(features["is_real_example"],
                                      dtype=tf.float32)

            #### Constructing the evaluation TPUEstimatorSpec with the new cache.
            label_ids = tf.reshape(features['label_ids'], [-1])
            metric_args = [per_example_loss, label_ids, logits, is_real_example]

            if FLAGS.use_tpu:
                eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    eval_metrics=(metric_fn, metric_args),
                    scaffold_fn=scaffold_fn)
            else:
                eval_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    eval_metric_ops=metric_fn(*metric_args))

            return eval_spec

        #### Configuring the optimizer
        train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

        monitor_dict = {}
        monitor_dict["lr"] = learning_rate

        #### Constructing the training TPUEstimatorSpec with the new cache.
        if FLAGS.use_tpu:
            #### Creating host calls
            host_call = None

            train_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, loss=total_loss, train_op=train_op,
                host_call=host_call,
                scaffold_fn=scaffold_fn)
        else:
            train_spec = tf.estimator.EstimatorSpec(
                mode=mode, loss=total_loss, train_op=train_op)

        return train_spec
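
For orientation, this model_fn is presumably returned by a small builder closure like the one sketched below, consistent with the get_model_fn() call in the main() example later in this section (the wrapper itself is an assumption):

def get_model_fn():
    # Assumed wrapper: the inner model_fn closes over FLAGS and the helpers above
    # and is handed directly to the (TPU)Estimator constructor.
    def model_fn(features, labels, mode, params):
        ...  # body as defined above
    return model_fn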
Exemple #49
0
def exec_service_mysqld_start():
    """Start the mysql service via `service`."""
    logger.info('Trying to start mysql via service')
    cmd = ['service', 'mysqld', 'start']
    util.exec.exec_cmd(cmd, exec_service_mysqld_start.__doc__)
Exemple #50
0
def main(_):
    logger.set_verbosity(logger.INFO)

    #### Validate flags
    if FLAGS.save_steps is not None:
        FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    # TPU Configuration
    run_config = model_utils.configure_tpu(FLAGS)

    model_fn = get_model_fn()

    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_tpu:
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size)
    else:
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config)

    if FLAGS.do_train:
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)

        if not tf.gfile.Exists(train_file) or FLAGS.overwrite_data:
            train_examples = get_examples(FLAGS.data_dir, "train")
            random.shuffle(train_examples)
            file_based_convert_examples_to_features(
                train_examples, tokenize_fn, train_file)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_eval:
        eval_examples = get_examples(FLAGS.data_dir, FLAGS.eval_split)
        logger.info("Num of eval samples: {}".format(len(eval_examples)))

        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on. These do NOT count towards the metric (all tf.metrics
        # support a per-instance weight, and these get a weight of 0.0).
        #
        # Modified in XL: We also adopt the same mechanism for GPUs.

        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)

        if FLAGS.high_only:
            eval_file_base = "high." + eval_file_base
        elif FLAGS.middle_only:
            eval_file_base = "middle." + eval_file_base

        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)
        file_based_convert_examples_to_features(
            eval_examples, tokenize_fn, eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        ret = estimator.evaluate(
            input_fn=eval_input_fn,
            steps=eval_steps)

        # Log current result
        logger.info("=" * 80)
        log_str = "Eval | "
        for key, val in ret.items():
            log_str += "{} {} | ".format(key, val)
        logger.info(log_str)
        logger.info("=" * 80)
Exemple #51
0
    def build_models(self, build_sampler=True):
        logger.info('Building the text to image GAN model')
        # 1. real image and right text
        with tf.variable_scope(""):
            self.d_network_rr = GAN.img_discriminator(self.config,
                                                      stage=self.stage)
            self.d_network_rr.build_models(self.real_img, self.real_sen_rep)
        self.score_r = self.d_network_rr.get_score()
        self.loss_r = tf.reduce_mean(
            compat_tf.sigmoid_cross_entropy_with_logits(logits=self.score_r,
                                                        labels=tf.ones_like(
                                                            self.score_r)))
        logger.info('loss from real image and right text generated')

        # 2. real image and wrong text
        with tf.variable_scope("", reuse=True):
            self.d_network_rw = GAN.img_discriminator(self.config,
                                                      stage=self.stage)
            self.d_network_rw.build_models(self.real_img, self.wrong_sen_rep)
        self.score_rw = self.d_network_rw.get_score()
        self.loss_w = tf.reduce_mean(
            compat_tf.sigmoid_cross_entropy_with_logits(logits=self.score_rw,
                                                        labels=tf.zeros_like(
                                                            self.score_rw)))
        logger.info('loss from real image and wrong text generated')

        # 3. fake image and right text
        with tf.variable_scope(''):
            self.g_network = GAN.img_generator(self.config, stage=self.stage)
            self.g_network.build_image_generator(self.noise_input,
                                                 self.real_sen_rep)
            self.fake_img = self.g_network.get_fake_image()

        with tf.variable_scope("", reuse=True):
            self.d_network_wr = GAN.img_discriminator(self.config,
                                                      stage=self.stage)
            self.d_network_wr.build_models(self.fake_img, self.real_sen_rep)
        self.fr_score = self.d_network_wr.get_score()
        self.loss_f = tf.reduce_mean(
            compat_tf.sigmoid_cross_entropy_with_logits(logits=self.fr_score,
                                                        labels=tf.zeros_like(
                                                            self.fr_score)))
        logger.info('loss from fake image and right text generated')

        # the loss of generator and the discriminator
        self.loss_d = self.loss_r + self.loss_f + self.loss_w

        self.loss_g = tf.reduce_mean(
            compat_tf.sigmoid_cross_entropy_with_logits(logits=self.fr_score,
                                                        labels=tf.ones_like(
                                                            self.fr_score)))

        # build the sampler
        if build_sampler:
            with tf.variable_scope('', reuse=True):
                self.sample_network = GAN.img_generator(self.config,
                                                        stage='test')
                self.sample_network.build_image_generator(
                    self.noise_input, self.real_sen_rep)
                self.sample_img = self.sample_network.get_fake_image()
        return
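
A hedged sketch of how the discriminator and generator losses built above are usually optimized; the variable name prefixes used to split the parameters, and the `model` instance, are assumptions rather than part of this snippet:

model = ...  # an instance of the class above, after build_models() has run (assumption)

d_vars = [v for v in tf.trainable_variables() if v.name.startswith('d_')]
g_vars = [v for v in tf.trainable_variables() if v.name.startswith('g_')]

d_train_op = tf.train.AdamOptimizer(2e-4, beta1=0.5).minimize(model.loss_d, var_list=d_vars)
g_train_op = tf.train.AdamOptimizer(2e-4, beta1=0.5).minimize(model.loss_g, var_list=g_vars)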
Exemple #52
0
 def frameworkMessage(self, driver, executorId, slaveId, message):
     """Echo a framework message."""
     logger.info("Received framework message: %s", message)
Exemple #53
0
def cw_random_word_attack():
    cw = CarliniL2_untargeted_qa(debug=args.debugging)
    criterion = nn.CrossEntropyLoss()
    loss = 0
    adv_loss = 0
    targeted_success = 0
    untargeted_success = 0
    adv_text = []
    answers = dict()
    adv_answers = dict()

    backup_params = EMA(0)
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup_params.register(name, param.data)
            param.data.copy_(ema.get(name))
    tot = 0
    for batch in tqdm(iter(data.dev_iter), total=1000):
        p1, p2 = model(batch)
        orig_answer, orig_s_idx, orig_e_idx = write_to_ans(
            p1, p2, batch, answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()

        append_info = append_random_input(batch)
        allow_idxs = append_info['allow_idx']
        batch_start_target = torch.LongTensor([0]).to(device)
        batch_end_target = torch.LongTensor([0]).to(device)

        input_embedding = model.word_emb(batch.c_word[0])
        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        cw_mask = torch.from_numpy(cw_mask).float().to(device)

        for bi, allow_idx in enumerate(allow_idxs):
            cw_mask[bi, np.array(allow_idx)] = 1
        cw.wv = model.word_emb.weight
        cw.inputs = batch
        cw.mask = cw_mask
        cw.batch_info = append_info
        cw.num_classes = append_info['tot_length']
        # print(transform(to_list(batch.c_word[0][0])))
        cw.run(model, input_embedding, (batch_start_target, batch_end_target))

        # re-test
        for bi, allow_idx in enumerate(allow_idxs):
            if bi in cw.o_best_sent:
                for i, idx in enumerate(allow_idx):
                    batch.c_word[0].data[bi, idx] = cw.o_best_sent[bi][i]
        p1, p2 = model(batch)
        adv_answer, adv_s_idx, adv_e_idx = write_to_ans(
            p1, p2, batch, adv_answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        adv_loss += batch_loss.item()

        for bi, (start_target, end_target) in enumerate(
                zip(batch_start_target, batch_end_target)):
            start_output = adv_s_idx
            end_output = adv_e_idx
            targeted_success += int(
                compare(start_output, start_target.item(), end_output,
                        end_target.item()))
            untargeted_success += int(
                compare_untargeted(start_output, start_target.item(),
                                   end_output, end_target.item()))
        for i in range(len(allow_idxs)):
            try:
                logger.info(("adv:", transform(cw.o_best_sent[i])))
                adv_text.append({
                    'added_text':
                    transform(cw.o_best_sent[i]),
                    'adv_text':
                    transform(to_list(batch.c_word[0][0])),
                    'qas_id':
                    batch.id[i],
                    'orig_predict': (orig_s_idx, orig_e_idx),
                    'adv_predict': (adv_s_idx, adv_e_idx),
                    'Orig answer:':
                    orig_answer,
                    'Adv answer:':
                    adv_answer
                })
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
            except Exception:
                adv_text.append({
                    'adv_text':
                    transform(to_list(batch.c_word[0][0])),
                    'qas_id':
                    batch.id[i],
                    'orig_predict': (orig_s_idx, orig_e_idx),
                    'adv_predict': (adv_s_idx, adv_e_idx),
                    'Orig answer:':
                    orig_answer,
                    'Adv answer:':
                    adv_answer
                })
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
                continue
        # for batch size = 1
        tot += 1
        logger.info(("orig predict", (orig_s_idx, orig_e_idx)))
        logger.info(("adv append predict", (adv_s_idx, adv_e_idx)))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargetd successful rate:", untargeted_success))
        logger.info(("Orig answer:", orig_answer))
        logger.info(("Adv answer:", adv_answer))
        logger.info(("tot:", tot))

    for name, param in model.named_parameters():
        if param.requires_grad:
            param.data.copy_(backup_params.get(name))

    with open(options.prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers), file=f)
    with open(options.prediction_file + '_adv.json', 'w',
              encoding='utf-8') as f:
        print(json.dumps(adv_answers), file=f)
    results = evaluate.main(options)
    logger.info(tot)
    logger.info(("adv loss, results['exact_match'], results['f1']", loss,
                 results['exact_match'], results['f1']))
    return loss, results['exact_match'], results['f1']
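
The compare and compare_untargeted helpers used above are not shown; hypothetical sketches consistent with how they are called, with exact span match for the targeted case and span overlap for the untargeted one:

def compare(start_output, start_target, end_output, end_target):
    # Hypothetical: targeted success means the predicted span equals the target span.
    return start_output == start_target and end_output == end_target

def compare_untargeted(start_output, start_target, end_output, end_target):
    # Hypothetical: untargeted success only requires the predicted span to
    # overlap the targeted region.
    return (start_target <= start_output <= end_target or
            start_target <= end_output <= end_target)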
Exemple #54
0
def queue_segments(jobs, conf, connection):
    """Make a new Mesos job for every segment."""
    try:
        has_enough_segs = False
        for segment in determine_segments(**conf):
            completed_scene_list = []
            segment_length = len(segment)
            if segment_length >= conf.minscenesperseg:
                has_enough_segs = True
                logger.info("Segment length: %d", len(segment))
                logger.info("Segment: %s", segment)
                for scene_record in segment:
                    # Build list to be used in SQL Insert statement
                    row = (scene_record['LANDSAT_PRODUCT_ID'],
                           scene_record['FILE_LOC'])
                    completed_scene_list.append(row)

                    # set 'BLANK' to 'INQUEUE' processing status
                    db.set_scene_to_inqueue(connection,
                                            scene_record['LANDSAT_PRODUCT_ID'])
                logger.info(
                    "Scenes inserted into ARD_PROCESSED_SCENES table:"
                    " %s", completed_scene_list)
                db.processed_scenes(connection, completed_scene_list)

                # WARNING: This assumes subdirectories are desired
                subdirdest = {
                    'LT04': 'tm',
                    'LT05': 'tm',
                    'LE07': 'etm',
                    'LC08': 'oli_tirs'
                }
                final_output = (os.path.join(
                    conf.outdir, "lta_incoming",
                    subdirdest[segment[0]['SATELLITE']], 'ARD_Tile'))

                # Build the Docker entrypoint command.
                cmd = ' '.join([
                    'cli.py', "'" +
                    json.dumps(segment, sort_keys=True, default=str) + "'",
                    final_output
                ])
                job_id = format_job_id(segment)
                logger.debug('Command to clip: [%s]', cmd)

                # Compile the job information.
                job = Job()
                job.cpus = conf.cpus
                job.disk = conf.disk
                job.mem = conf.memory
                job.command = cmd
                job.job_id = job_id
                jobs.append(job)
                logger.info('Queuing job id: %s', job_id)

        if not has_enough_segs:
            logger.info("No segments meet the %d scenes per segment minimum",
                        conf.minscenesperseg)

        return SUCCESS

    except Exception:
        logger.exception('Unable to fetch segments!')
        return ERROR
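
The Job object populated above is assumed to be a plain attribute container along these lines (a sketch; the real class may carry extra Mesos bookkeeping):

class Job(object):
    """Plain container describing one Mesos task to launch."""

    def __init__(self):
        self.cpus = 0
        self.disk = 0
        self.mem = 0
        self.command = ''
        self.job_id = ''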
Exemple #55
0
 def shutdown(self, signum, frame):
     """Call for signal interrupts."""
     self.flag = True
     logger.info("Shutdown requested.")
Exemple #56
0
def cw_rand_words_attack(data_val):
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    orig_append_correct = 0
    tot = 0
    adv_pickle = []

    cw = CarliniL2_random(debug=args.debugging)
    for batch in get_random_word_batch(data_val):
        data = batch['data']
        seq_len = batch['seq_len']
        label = batch['label']
        batch_add_start = batch['add_start']
        batch_add_end = batch['add_end']
        attack_targets = batch['attack_targets']
        add_sents = batch['add_sents']
        allow_idxs = batch['allow_idx']
        tot += len(label)

        input_embedding = model.bert.embeddings.word_embeddings(data)
        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        for bi, allow_idx in enumerate(allow_idxs):
            cw_mask[bi, np.array(allow_idx)] = 1
        cw_mask = torch.from_numpy(cw_mask).float().to(device)
        cw.wv = model.bert.embeddings.word_embeddings.weight
        cw.mask = cw_mask
        cw.seq = data
        cw.batch_info = batch
        cw.seq_len = seq_len
        adv_data = cw.run(model, input_embedding, attack_targets)

        adv_seq = torch.tensor(batch['data']).to(device)
        for bi, allow_idx in enumerate(allow_idxs):
            if bi in cw.o_best_sent:
                for i, idx in enumerate(allow_idx):
                    adv_seq.data[bi, idx] = cw.o_best_sent[bi][i]
        out = model(adv_seq, seq_len)['pred']
        prediction = torch.max(out, 1)[1]
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == label).float()).item()
        targeted_success += torch.sum(
            (prediction == attack_targets).float()).item()
        untargeted_success += untargeted_success_rate(prediction, label)

        for i in range(len(adv_seq)):
            adv_pickle.append({
                'raw_text': transform(adv_seq[i]),
                'label': label[i].item()
            })
            try:
                # logger.info(("orig:", transform(add_sents[i][1:])))
                logger.info(("adv:", transform(cw.o_best_sent[i])))
            except Exception:
                continue

        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargetd successful rate:", untargeted_success))
        logger.info(("tot:", tot))
        joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted successful rate:", targeted_success / tot))
    logger.info(("untargetd successful rate:", untargeted_success / tot))
Exemple #57
0
 def start(self):
     logger.info('Watching repository {user}/{repo} for updates'.format(
         user=self.__user, repo=self.__repo))
     super().start()
Exemple #58
0
def run_forever(conf):
    """Entrypoint to keep the framework running until terminated."""
    logger.info('******************Start************')
    logger.debug('DB connection: %s', conf.l2_db_con)
    logger.debug("Minimum Senes Per Seg: %s", conf.minscenesperseg)
    logger.debug('Segment query: %s', conf.segment_query)

    global shutdown
    db.reset_records(db.connect(conf.l2_db_con))

    # Establish framework, executor, and authentication credentials
    framework = mesos_pb2.FrameworkInfo()
    framework.user = conf.framework_user
    framework.name = "ARD Tile Framework"
    framework.principal = conf.mesos_principal
    framework.role = conf.mesos_role

    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = "default"
    executor.name = "ARD Tile executor"

    implicit_acks = 1
    scheduler = ArdTileScheduler(implicit_acks, executor, conf)

    if not conf.disable_creds:
        logger.info("             MESOS creds ENABLED")

        credential = mesos_pb2.Credential()
        credential.principal = conf.mesos_principal
        credential.secret = conf.mesos_secret
        driver = mesos.native.MesosSchedulerDriver(scheduler, framework,
                                                   conf.master, implicit_acks,
                                                   credential)
    else:
        logger.info("             MESOS creds disabled")
        driver = mesos.native.MesosSchedulerDriver(scheduler, framework,
                                                   conf.master, implicit_acks)

    shutdown = Shutdown()

    def run_driver_async():
        """Thread for async communication with Mesos offers."""
        # driver.run() blocks, so run it in a separate thread.
        status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()
        sys.exit(status)

    framework_thread = Thread(target=run_driver_async, args=())
    framework_thread.start()

    while framework_thread.is_alive():
        # If a shutdown has been requested, suppress offers and wait for the
        # framework thread to complete.
        if shutdown.flag:
            logger.info("Shutdown requested....")
            driver.suppressOffers()
            while framework_thread.is_alive():
                logger.debug("Thread alive, sleep 5....")
                time.sleep(5)
            break

        # If the job queue is empty, get work.
        if (not scheduler.jobs
                and queue_segments(scheduler.jobs, conf,
                                   db.connect(conf.l2_db_con)) == ERROR):
            driver.stop(True)
            sys.exit(1)

        # If there's no new work to be done or the max number of jobs are
        # already running, suppress offers and wait for some jobs to finish.
        if (not scheduler.jobs or not scheduler.scheduling_allowed()):
            logger.info("No jobs or scheduling not allowed....")
            driver.suppressOffers()
            while not scheduler.scheduling_allowed():
                logger.debug("Scheduling not alive, sleep 20....")
                time.sleep(20)
            while not scheduler.jobs:
                if queue_segments(scheduler.jobs, conf,
                                  db.connect(conf.l2_db_con)) == ERROR:
                    driver.stop(True)
                    sys.exit(1)
                time.sleep(20)

            driver.reviveOffers()
Exemple #59
0
def run():
    logger.info("using device: {}".format(config.DEVICE))
    train_data = process_raw_data()
    train_list, test_list = train_test_split(train_data,
                                             test_size=0.2,
                                             random_state=34)

    # Load the GPT2 model
    model, n_ctx = create_model(True)
    model.to(config.DEVICE)
    # Whether to use multiple GPUs for parallel training; choose which cards to use
    multi_gpu = False
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        logger.info("Using more than one GPU to train...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = config.DEVICE_NUM
        model = DataParallel(
            model, device_ids=[int(i) for i in config.DEVICE_NUM.split(",")])
        multi_gpu = True

    # Log the number of model parameters
    num_parameters = sum(
        [parameter.numel() for parameter in model.parameters()])
    logger.info("number of model parameters: {}".format(num_parameters))

    # Load the data
    logger.info("loading training data")
    train_dataset = DialogueDataset(train_list, n_ctx)
    batch_num = len(train_dataset) // config.BATCH_SIZE
    test_dataset = DialogueDataset(test_list, n_ctx)
    test_batch_num = len(test_dataset) // config.BATCH_SIZE

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=collate_fn)

    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=1,
                                  collate_fn=collate_fn)

    # Compute total_steps, the total number of optimizer update steps over all epochs
    total_steps = int(
        len(train_data_loader) * config.EPOCHS / config.BATCH_SIZE /
        config.GRADIENT_ACCUMULATION)
    logger.info('total training steps = {}'.format(total_steps))

    # Set up the optimizer and use a warmup schedule at the start of training
    optimizer = AdamW(model.parameters(),
                      lr=config.LEARNING_RATE,
                      correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARM_STEPS,
        num_training_steps=total_steps)

    logger.info("start training...")
    best_loss = 100
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_fn(model, train_data_loader, optimizer, scheduler, epoch,
                 batch_num, multi_gpu)
        loss, accuracy = eval_fn(model, test_data_loader, test_batch_num,
                                 multi_gpu)
        if loss < best_loss or accuracy > best_accuracy:
            logger.info('saving model for epoch {}, best loss: {}'.format(
                epoch + 1, loss))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(config.MODEL_PATH)
            best_loss = loss
            best_accuracy = accuracy
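
The collate_fn handed to both DataLoaders above is not part of this snippet; a hypothetical sketch that pads each batch of token-id sequences to the longest one in the batch:

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Hypothetical: each dataset item is assumed to be a list of token ids.
    inputs = [torch.tensor(ids, dtype=torch.long) for ids in batch]
    return pad_sequence(inputs, batch_first=True, padding_value=0)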
Exemple #60
0
    parser.add_argument(action="store",
                        dest='output_path',
                        type=str,
                        metavar='PATH')
    parser.add_argument('-c',
                        '--config',
                        action="store",
                        dest='config_file',
                        default='/ARD_Clip.conf',
                        required=False,
                        type=str,
                        metavar='PATH')
    return vars(parser.parse_args())


if __name__ == '__main__':
    args = parse_cli()
    conf = config.read_config(args['config_file'])
    setup_logger(level='debug' if conf.debug else 'info')

    logger.info('******************Start************')
    logger.info('             DB connection: %s', conf.connstr)
    logger.info("             Version: %s", conf.version)
    logger.info("             Debug: %s", conf.debug)
    logger.info('segment: %s', args['segment'])
    logger.info('output path: %s', args['output_path'])

    process_segments(args['segment'], args['output_path'], conf)

    logger.info('..................Normal End............')