Ejemplo n.º 1
0
def generateModelsForPeriod(years, minSentences, maxSentences, nSteps, chunkSize,
                            randomSeed, vector_size, modelFolder, fixedAlpha=False):
    '''Generates sets of w2v models trained using randomly selected sentences
    from the given year period. Models are trained with an increasing number of
    sentences, starting with minSentences and going up to maxSentences (in nSteps,
    so nSteps models are generated).

    randomSeed is used to initialize the random number generator.
    vector_size defines the number of dimensions used by the w2v models.
    modelFolder indicates the directory where the models will be saved.
    fixedAlpha can be set to True to use a fixed learning rate for w2v.

    Files generated in modelFolder:
     - nSentences.pkl -- list containing the number of sentences to be used in each batch.
     - year_nSentences.w2v -- the models (e.g. 1990_000100000000.w2v) (and corresponding vocab)
    '''
    # Build chunks to be used (if needed)
    checkPath(modelFolder)
    for year in years:
        buildChunks(year, chunkSize)

    # Initialize loader
    loader = RandomSentenceLoader(years, chunkSize=chunkSize, seed=randomSeed)

    # The number of sentences to be used in each batch.
    batchSizes = np.logspace(np.log10(minSentences), np.log10(
        maxSentences), num=nSteps, dtype=int)
    pkl.dump(batchSizes, open(modelFolder + '/nSentences.pkl', 'w'))

    for batchSize in batchSizes:
        cumSentencesStr = ('%d' % batchSize).zfill(12)
        modelKey = '%d_' % years[0] + cumSentencesStr

        modelName = modelFolder + '/%s.w2v' % (modelKey)
        vocabName = modelName.replace('.w2v', '.vocab.w2v')
        print 'Building model: ', modelName

        # Initialize w2v model
        vocabSizeMB = 1000 * 1024 * 1024
        if fixedAlpha:
            print '...using fixed alpha...'
            model = gensim.models.Word2Vec(
                max_vocab_size=vocabSizeMB, seed=randomSeed, size=vector_size, alpha=0.75, min_alpha=0.75)
        else:
            model = gensim.models.Word2Vec(
                max_vocab_size=vocabSizeMB, seed=randomSeed, size=vector_size)

        # build vocabulary, then train the model with same sentences.
        # We use a generator to avoid loading all sentences at once, but that
        # means in order to use the same set of sentences, we reset loader.
        loader.reset()
        batch = loader.nextBatchGenerator(batchSize=batchSize)
        model.build_vocab(batch)

        loader.reset()
        batch = loader.nextBatchGenerator(batchSize=batchSize)
        model.train(batch)

        print '...saving'
        model.wv.save_word2vec_format(modelName, fvocab=vocabName, binary=True)
Ejemplo n.º 2
0
def _getDocumentsForYear(year):
    """Retrieves the content of all documents for a year specified. Also caches
    the document for later use (if required). This is meant to speed document
    processing time, as document preparation requires significant time."""
    cachedFile = _getCachedName(year)

    checkPath(cacheDir)

    print 'Loading cached file: ' + cachedFile
    if os.path.exists(cachedFile):
        with gzip.open(cachedFile, 'rb') as f:
            documentsForYear = pkl.load(f)
    else:
        docs = _listDocsForYear(year)
        documentsForYear = []
        for doc in docs:
            articles = _getArticlesInDoc(doc)
            sentences = [
                _getSentencesInArticle(article) for article in articles
            ]
            sentences = [_prepareSentences(sentence) for sentence in sentences]
            documentsForYear += sentences
        print 'Saving to cache %s...' % cachedFile

        with gzip.open(cachedFile, 'wb') as f:
            pkl.dump(documentsForYear, f)
    return documentsForYear
Ejemplo n.º 3
0
def generate(size, dir, path, outputName, params):
    """Generate `size` control/test document pairs and return their records.

    A batch directory is created under `dir`; its name is `outputName` as
    resolved by util.checkPath, or a timestamp when `outputName` is empty.
    For every ID from 1 to `size` a pair is obtained with getPair(path, params)
    and two documents ('control' and 'test') are produced with makeDoc.

    Returns a list whose first row is the header
    ['email', 'gender', 'school', 'date'] followed by whatever makeDoc
    returned for each generated document (two rows per ID).

    A modal progress dialog tracks the number of completed pairs, and a
    message box is shown when generation is complete.
    """
    if not outputName:
        # NOTE(review): "%s" is not a portable strftime directive (it is
        # platform dependent); "%S" may have been intended. Left unchanged so
        # that generated batch names stay the same.
        batchName = datetime.datetime.now().strftime("%b-%d-%Y-%H%M%p%s")
    else:
        batchName = util.checkPath(dir, outputName)

    # os.mkdir returns None, so there is nothing useful to keep from it.
    os.mkdir(dir + "/" + batchName)

    record = [['email', 'gender', 'school', 'date']]

    # The dialog range matches the number of pairs so the bar fills exactly
    # once and auto-closes when the last pair is done.
    total = max(int(size), 0)
    progress = QProgressDialog("Generating Resumes", "Close", 0, total)
    progress.setWindowModality(QtCore.Qt.WindowModal)
    progress.setAutoClose(True)
    progress.show()

    ID = 1
    while ID <= size:
        pair = getPair(path, params)
        record.append(makeDoc(pair, 'control', ID, batchName, params))
        record.append(makeDoc(pair, 'test', ID, batchName, params))
        progress.setValue(ID)  # ID pairs have been completed at this point
        ID += 1

    # Make sure the dialog is dismissed even when no pair was generated.
    progress.reset()

    message = QMessageBox()
    message.setText("Resume generation complete.")
    message.exec()
    return record
Ejemplo n.º 4
0
def measureConvergence(y0, nYears, saveDir):
    """
    Call computeConvergenceOverYearRange for sentences in the given year range
    and save convergence results to the given directory.

    The range covers every year returned by getYears() with
    y0 <= year <= y0 + nYears (both ends inclusive). The tuple
    (convergence, sentenceYearCounter, vocabSize) is pickled to
    saveDir/convergenceRange_<y0>-<yN>.pkl.
    """
    # Local import keeps this function self-contained.
    import os

    batchSize = 1e6
    maxSentences = 10e7  # i.e. 1e8 sentences
    yN = y0 + nYears
    allYears = getYears()
    # Boolean-mask selection; assumes getYears() returns a numpy array --
    # TODO confirm.
    yearRange = allYears[(y0 <= allYears) & (allYears <= yN)]

    convergence, sentenceYearCounter, vocabSize = computeConvergenceOverYearRange(
        yearRange, batchSize, maxSentences)

    checkPath(saveDir)
    # Join with a separator so saveDir works with or without a trailing slash.
    fname = os.path.join(saveDir, 'convergenceRange_%d-%d.pkl' % (y0, yN))
    with open(fname, 'wb') as resultFile:
        pkl.dump((convergence, sentenceYearCounter, vocabSize), resultFile)
Ejemplo n.º 5
0
    def getMoods(self, qqnumber):
        """Download the mood pages of `qqnumber` into mood_result/<qqnumber>/.

        Pages are requested 20 positions at a time and each raw response is
        written to a file named after its position. Fetching stops when the
        response reports an empty message list or that access is denied; the
        process exits if the response reports an expired cookie. The access
        result is recorded in self.access and finally written to Access.json.
        """
        self.headers['Referer'] = 'http://user.qzone.qq.com/' + qqnumber

        # Make sure the per-account folder used to store the pages is there.
        resultDir = 'mood_result/' + qqnumber
        util.checkPath(resultDir)

        # Base URL of the target; the page position is appended per request.
        baseUrl = util.parseMoodsUrl(qqnumber)

        offset = 0
        morePages = True
        while morePages:
            print("\tDealing with position:\t%d" % offset)
            response = self.session.get(baseUrl + "&pos=%d" % offset,
                                        headers=self.headers)
            body = response.text
            with open(resultDir + '/' + str(offset), 'w', encoding="utf-8") as pageFile:
                pageFile.write(body)

            # Access forbidden: the owner made the moods private.
            denied = '''"对不起,主人设置了保密,您没有权限查看"''' in body
            if denied:
                with open('naida_log.log', 'a', encoding="utf-8") as logFile:
                    logFile.write("%s No access..\n" % qqnumber)
            self.access[qqnumber] = 'False' if denied else 'True'

            # Stop after this page when access is denied or nothing is left.
            morePages = not (denied or '''"msglist":null''' in body)

            # Cookie expired: log it and terminate.
            if '''"subcode":-4001''' in body:
                with open('naida_log.log', 'a', encoding="utf-8") as log_file:
                    log_file.write('Cookie expired! Time is %s\n' % time.ctime())
                sys.exit()

            offset += 20
            time.sleep(5)

        with open('Access.json', 'w', encoding='utf-8') as accessFile:
            accessFile.write(json.dumps(self.access, ensure_ascii=False, indent=4))
Ejemplo n.º 6
0
    def getMoodsStart(self):
        """Start one worker thread per number listed in qqnumber.inc.

        Each worker runs self.getMooding(item, app, saveBackNumber), where
        saveBackNumber is a copy of the list of numbers still pending
        (including `item`) at the time the worker is started. A semaphore of
        10 is acquired before every thread start; it is presumably released
        by getMooding -- verify there. The method returns once every started
        worker has finished.
        """
        app = Moods()

        with open('qqnumber.inc', encoding="utf-8") as qnumberFile:
            qnumberString = qnumberFile.read()
        # SECURITY NOTE(review): eval() executes whatever code qqnumber.inc
        # contains. If the file only ever holds a literal list,
        # ast.literal_eval would be a safe replacement -- confirm the format.
        qnumberList = eval(qnumberString)

        # Check that the output folder exists; create it if it does not.
        util.checkPath('mood_result')
        self.sem = threading.Semaphore(10)

        workers = []
        while qnumberList:
            saveBackNumber = qnumberList[:]
            item = qnumberList.pop()

            self.sem.acquire()
            t = threading.Thread(target=self.getMooding, args=(item, app, saveBackNumber))
            t.start()
            workers.append(t)
            time.sleep(1)

        # Wait for the workers so the message below is only printed once all
        # the work has really been done.
        for t in workers:
            t.join()
        print("Finish All!")
Ejemplo n.º 7
0
def generateModels(y0, yN, yearsInModel, stepYears, modelFolder):
    """Generate time shifting w2v models on the given time range (y0 - yN).
    Each model contains the specified number of years (yearsInModel). The start
    year of each new model is set to be stepYears after the previous model.
    Resulting models are saved on modelFolder.
    """
    checkPath(modelFolder)

    for year in range(y0, yN - yearsInModel + 1, stepYears):
        startY = year
        endY = year + yearsInModel
        modelName = modelFolder + '/%d_%d.w2v' % (year, year + yearsInModel)
        vocabName = modelName.replace('.w2v', '.vocab.w2v')
        print 'Building model: ', modelName

        sentences = getSentencesInRange(startY, endY)
        model = gensim.models.Word2Vec(min_count=1)
        model.build_vocab(sentences)
        model.train(sentences)

        print '...saving'
        model.init_sims(replace=True)
        model.wv.save_word2vec_format(modelName, fvocab=vocabName, binary=True)
Ejemplo n.º 8
0
 def __init__(self):
     """Prepare the instance for fetching the friends file.

     Stores util.headers and the URL returned by util.parseFriendsUrl() on
     the instance, calls util.checkPath('friends') and prints a start
     message.
     """
     # NOTE: this is the same object as util.headers, not a copy, so any
     # mutation through self.headers is visible to other users of util.headers.
     self.headers = util.headers
     self.baseUrl = util.parseFriendsUrl()
     # Presumably ensures the 'friends' output folder exists -- verify in util.
     util.checkPath('friends')
     print('Start get friends file')
Ejemplo n.º 9
0
 def getAccess(self):
     """Load and return the content of Access.json.

     The file is expected to hold JSON. It is decoded with the json module
     rather than eval(), so file content is never executed as code and JSON
     literals such as true/false/null are handled. An empty (or
     whitespace-only) file yields an empty dict.
     """
     # Local import keeps this method self-contained.
     import json

     util.checkPath('Access.json')
     with open('Access.json', 'r', encoding='utf-8') as f:
         fstring = f.read()
     if not fstring.strip():
         return {}
     return json.loads(fstring)