Ejemplo n.º 1
0
def sim_word_to_all(vectorspath, pivotwordvectors, identity, simfunction):
    """Score every vector of a text vector file against each pivot word.

    The file at ``vectorspath`` is read line by line: the first line is
    skipped, and every following non-blank line is parsed as
    ``<key> <float> <float> ...``. For each pivot word the dot product between
    its vector and every file vector is computed and written, as a
    JSON-encoded string of space-separated numbers, to
    ``target_dir_path/<pivotword>/<identity>.json``. The keys, in file order,
    are pickled to ``vectorspath + '.wordspkl'`` if that file does not exist.

    Relies on the module-level names ``target_dir_path``, ``logger``,
    ``tools``, ``np``, ``os``, ``pickle`` and ``json``.

    Args:
        vectorspath: Path of the text vector file.
        pivotwordvectors: Mapping from pivot word to its vector, given as a
            whitespace-separated string of floats.
        identity: Base name of the output file written for each pivot word.
        simfunction: Not used by this function; kept so callers stay valid.

    Raises:
        StopIteration: If the vector file is completely empty.
        ValueError: If a non-blank line has no space after the key, or a
            component cannot be converted to float.
    """
    keys = []
    vectors = []

    with open(vectorspath) as fr:
        next(fr)  # the first line is not a vector entry
        logger.info('reading vectors in '+vectorspath)
        cnt = 0
        for line in fr:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the missing space.
                continue
            cnt += 1
            key, rest = line.split(' ', 1)
            # A real list (not a lazy map object) so np.array builds a numeric
            # matrix on Python 3 as well as Python 2.
            vectors.append([float(component) for component in rest.split()])
            keys.append(key)

            if cnt % 100000 == 0:
                logger.info('reading '+str(cnt)+' vectors of '+vectorspath)

    wordspath = vectorspath+'.wordspkl'
    if not os.path.isfile(wordspath):
        with open(wordspath, 'wb') as handle:
            pickle.dump(keys, handle)

    # Build the matrix once; it is identical for every pivot word.
    vector_count = len(vectors)
    matrix = np.array(vectors)
    del vectors  # the nested list is no longer needed once the matrix exists

    for pivotword, raw_vector in pivotwordvectors.items():
        logger.info('multipying '+pivotword+' to '+str(vector_count)+' vectors')
        pivotvector = np.array([float(component) for component in raw_vector.split()])
        simres = np.dot(matrix, pivotvector)
        simresstr = ' '.join(map(str, simres))

        pivot_dir = target_dir_path+'/'+pivotword+'/'
        tools._mkdir_recursive(pivot_dir)
        dest_path = pivot_dir+identity+'.json'
        logger.info('saving '+pivotword+' to '+dest_path)
        with open(dest_path, 'w') as wr:
            wr.write(json.dumps(simresstr))
Ejemplo n.º 2
0
    # Script-body fragment: reads three positional command-line arguments,
    # selects the simword input/output directories, then runs
    # simwordspan_calc and e-mails the traceback if it raises.
    logger.info('Starting...')

    argv=sys.argv[1:]

    # Positional arguments: <simfunction> <outputcode> <simwordtype>
    simfunction=argv[0]
    outputcode=argv[1]
    simwordtype=argv[2]

    if simwordtype=='orig':
        simword_dir_path='/data/nrekabsaz/similarity/result/randomness/'+simfunction+'/simword/'
        target_dir_path='/data/nrekabsaz/similarity/result/randomness/'+simfunction+'/simwordspan/' 
    # NOTE(review): this branch tests values_ready[1], which is not defined in
    # the visible code, while the branch above tests simwordtype -- presumably
    # simwordtype=='norm' was intended; confirm. If neither branch matches and
    # the paths are not assigned elsewhere, the mkdir call below cannot run.
    elif values_ready[1]=='norm':
        simword_dir_path='/data/nrekabsaz/similarity/result/randomness/'+simfunction+'/simwordnorm/'
        target_dir_path='/data/nrekabsaz/similarity/result/randomness/'+simfunction+'/simwordspannorm/'

    tools._mkdir_recursive(target_dir_path)

    try:
        logger.info('simwordspan_calc ('+str(outputcode)+') '+simfunction+' started...')

        simwordspan_calc(outputcode)

        msgText='simwordspan_calc ('+str(outputcode)+') '+simfunction+' finished!'

        logger.info(msgText)
    except Exception as e:
        # Any failure is logged with its full traceback and reported by e-mail;
        # the exception is not re-raised.
        msgText='run_simwordspan_calc('+str(outputcode)+') '+simfunction+'\n'+traceback.format_exc()
        logger.error(msgText)
        mail.sendemail_error(msgText)
        logger.info('email sent!')
def extended_distribution(pivotword_i):
    """Compute the binned mixture values for one pivot word and save them.

    Loads the ``simword`` JSON file of the pivot word at index ``pivotword_i``
    of the module-level ``pivotwords``. Each value of the loaded mapping is
    read as ``(mean, std)`` and given the band
    ``[mean - 3*std, mean + 3*std]``. For every bin edge of
    ``np.arange(1, -0.2, -.001)`` the normal-CDF difference between the next
    edge and the current edge is summed over all entries whose band contains
    the current edge; entries with ``std == 0`` contribute ``1.0``. The
    per-bin values are written, space-separated, to the ``extendeddist`` path
    of the pivot word.

    Relies on the module-level names ``pivotwords``, ``identity``,
    ``simfunction``, ``logger``, ``tools``, ``mail``, ``np``, ``norm``
    (presumably ``scipy.stats.norm`` -- confirm against the imports) and
    ``KeyboardInterruptError``.

    Args:
        pivotword_i: Index into ``pivotwords`` of the word to process.

    Raises:
        KeyboardInterruptError: In place of a ``KeyboardInterrupt`` raised
            during processing. Any other exception is e-mailed and logged,
            not propagated.
    """
    pivotword = pivotwords[pivotword_i]
    logphrase = identity+'/'+pivotword

    try:
        logger.info(logphrase+' : starting')
        target_path = '/data/nrekabsaz/experiments/randomness/'+simfunction+'/extendeddist/'+pivotword+'/'+identity+'.json'
        tools._mkdir_recursive(os.path.dirname(target_path))
        simword_path = '/data/nrekabsaz/experiments/randomness/'+simfunction+'/simword/'+pivotword+'/'+identity+'.json'

        logger.info(logphrase+' : '+'loading' + simword_path)
        with open(simword_path) as frr:
            data = json.load(frr)

        logger.info(logphrase+' : '+'calculating meanstdtopdown')
        # One (mean, std, upper bound, lower bound) tuple per entry.
        bands = []
        for entry in data.values():
            mean = entry[0]
            std = entry[1]
            bands.append((mean, std, mean+3*std, mean-3*std))
        # Highest upper bound first, matching the descending bin edges below.
        bands.sort(key=lambda band: band[2], reverse=True)
        band_count = len(bands)

        logger.info(logphrase+' : '+'calculating mixture norms')
        bins = np.arange(1, -0.2, -.001)
        bin_count = len(bins)
        # Index of the first band that contained the previous edge, or -1 if
        # none did; the scan for the next edge restarts from it.
        resume_at = 0
        mixpdflist = []
        for bin_i, edge in enumerate(bins):
            mixpdf = 0
            if resume_at != -1:
                cursor = resume_at
            # Otherwise keep scanning from where the previous edge stopped.
            resume_at = -1

            while cursor < band_count:
                mean, std, topbrd, downbrd = bands[cursor]
                if not topbrd >= edge:
                    # Bands are sorted by upper bound, so no later band can
                    # reach this edge either.
                    break
                if downbrd <= edge:
                    if resume_at == -1:
                        resume_at = cursor
                    if std != 0:
                        if bin_i+1 < bin_count:
                            # NOTE(review): the edges descend, so this
                            # difference is <= 0 -- confirm the sign is
                            # what downstream code expects.
                            mixpdf += (norm.cdf(bins[bin_i+1], loc=mean, scale=std) -
                                       norm.cdf(edge, loc=mean, scale=std))
                        else:
                            mixpdf += norm.cdf(edge, loc=mean, scale=std)
                    else:
                        mixpdf += 1.0
                cursor += 1
            mixpdflist.append(mixpdf)

            if bin_i % 50 == 0:
                logger.info(logphrase+' : calculating mixture value '+str(bin_i)+'/'+str(bin_count))

        logger.info(logphrase+' : saving into file')
        # Context manager so the handle is closed even if the write fails.
        with open(target_path, 'w') as wr:
            wr.write(' '.join(map(str, mixpdflist)))

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
    except Exception:
        msgText = identity+' : extended_distribution('+simfunction+') error!'+'\n'+traceback.format_exc()
        mail.sendemail_error(msgText)
        logger.error(msgText)
    # Script-body fragment: parses the command line, configures logging to a
    # file and to the console, then loads the pivot-word vectors.

    #parameters: positional arguments <simfunction> <identity> <thread_no>
    argv=sys.argv[1:]

    simfunction=argv[0]
    identity=argv[1]
    thread_no=int(argv[2])

    #paths
    # rootpath keeps only the first two components of this file's resolved
    # directory (e.g. '/a/b/c/d' -> '/a/b/').
    rootpath='/'.join(os.path.dirname(os.path.realpath(__file__)).split('/')[0:3])+'/'

    #logger
    # The 'simword' logger gets a file handler and a console handler that
    # share one formatter; both the logger and the console handler are set
    # to DEBUG.
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger('simword')
    # The log file name embeds the current hour, minute, day, month and year.
    logger_file_path=rootpath+'mycode/research_n_analyse/log/randomness/'+datetime.now().strftime('cosine_%H_%M_%d_%m_%Y.log')
    tools._mkdir_recursive(os.path.dirname(logger_file_path))
    file_hdlr = logging.FileHandler(logger_file_path)
    file_hdlr.setFormatter(formatter)
    logger.addHandler(file_hdlr)
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    #start
    begin_time=datetime.now()
    logger.info('Starting...')

    #get pivotwords
    # The path is relative to the current working directory.
    # NOTE(review): the file object passed to json.load is never explicitly
    # closed -- consider a with-block.
    pivotwordvectors=json.load(open('vectors/vector200-1.txt.norm'))