def generateModelsForPeriod(years, minSentences, maxSentences, nSteps, chunkSize, randomSeed, vector_size, modelFolder, fixedAlpha=False): '''Generates sets of w2v models trained using randomly selected sentences from the given year period. Models are trained with an increasing number of sentences, starting with minSentences and going up to maxSentences (in nSteps, so nSteps models are generated). randomSeed is used to initialize the random number generator. vector_size defines the number of dimensions used by the w2v models. modelFolder indicates the directory where the models will be saved. fixedAlpha can be set to True to use a fixed learning rate for w2v. Files generated in modelFolder: - nSentences.pkl -- list containing the number of sentences to be used in each batch. - year_nSentences.w2v -- the models (e.g. 1990_000100000000.w2v) (and corresponding vocab) ''' # Build chunks to be used (if needed) checkPath(modelFolder) for year in years: buildChunks(year, chunkSize) # Initialize loader loader = RandomSentenceLoader(years, chunkSize=chunkSize, seed=randomSeed) # The number of sentences to be used in each batch. batchSizes = np.logspace(np.log10(minSentences), np.log10( maxSentences), num=nSteps, dtype=int) pkl.dump(batchSizes, open(modelFolder + '/nSentences.pkl', 'w')) for batchSize in batchSizes: cumSentencesStr = ('%d' % batchSize).zfill(12) modelKey = '%d_' % years[0] + cumSentencesStr modelName = modelFolder + '/%s.w2v' % (modelKey) vocabName = modelName.replace('.w2v', '.vocab.w2v') print 'Building model: ', modelName # Initialize w2v model vocabSizeMB = 1000 * 1024 * 1024 if fixedAlpha: print '...using fixed alpha...' model = gensim.models.Word2Vec( max_vocab_size=vocabSizeMB, seed=randomSeed, size=vector_size, alpha=0.75, min_alpha=0.75) else: model = gensim.models.Word2Vec( max_vocab_size=vocabSizeMB, seed=randomSeed, size=vector_size) # build vocabulary, then train the model with same sentences. 
# We use a generator to avoid loading all sentences at once, but that # means in order to use the same set of sentences, we reset loader. loader.reset() batch = loader.nextBatchGenerator(batchSize=batchSize) model.build_vocab(batch) loader.reset() batch = loader.nextBatchGenerator(batchSize=batchSize) model.train(batch) print '...saving' model.wv.save_word2vec_format(modelName, fvocab=vocabName, binary=True)
def _getDocumentsForYear(year): """Retrieves the content of all documents for a year specified. Also caches the document for later use (if required). This is meant to speed document processing time, as document preparation requires significant time.""" cachedFile = _getCachedName(year) checkPath(cacheDir) print 'Loading cached file: ' + cachedFile if os.path.exists(cachedFile): with gzip.open(cachedFile, 'rb') as f: documentsForYear = pkl.load(f) else: docs = _listDocsForYear(year) documentsForYear = [] for doc in docs: articles = _getArticlesInDoc(doc) sentences = [ _getSentencesInArticle(article) for article in articles ] sentences = [_prepareSentences(sentence) for sentence in sentences] documentsForYear += sentences print 'Saving to cache %s...' % cachedFile with gzip.open(cachedFile, 'wb') as f: pkl.dump(documentsForYear, f) return documentsForYear
def generate(size, dir, path, outputName, params):
    """Generate `size` control/test resume pairs into a new batch folder.

    size       -- number of resume pairs to create.
    dir        -- parent directory in which the batch folder is created.
    path       -- forwarded to getPair (source data location).
    outputName -- batch folder name; when empty, a timestamp is used.
    params     -- generation parameters forwarded to getPair/makeDoc.

    Returns the record table: a header row followed by one row per
    generated document.
    """
    currentTime = datetime.datetime.now()
    if len(outputName) <= 0:
        # No explicit name given: fall back to a timestamped batch name.
        batchName = currentTime.strftime("%b-%d-%Y-%H%M%p%s")
    else:
        # NOTE(review): util.checkPath is called with two arguments here
        # but with one argument elsewhere in the project -- confirm its
        # signature and return value.
        batchName = util.checkPath(dir, outputName)

    childDirectory = dir + "/" + batchName
    # BUGFIX: os.mkdir returns None; the original assigned that None back
    # to childDirectory, clobbering the path.
    os.mkdir(childDirectory)

    record = [['email', 'gender', 'school', 'date']]

    # BUGFIX: the dialog's maximum now matches the loop bound (was a
    # hard-coded 100 while setValue received values up to `size`).
    progress = QProgressDialog("Generating Resumes", "Close", 0, size)
    progress.setWindowModality(QtCore.Qt.WindowModal)
    progress.setAutoClose(True)
    progress.show()

    for ID in range(1, size + 1):
        pair = getPair(path, params)
        record.append(makeDoc(pair, 'control', ID, batchName, params))
        record.append(makeDoc(pair, 'test', ID, batchName, params))
        progress.setValue(ID)

    # BUGFIX: the trailing `del pair` raised NameError when size == 0;
    # the local is simply dropped now.
    message = QMessageBox()
    message.setText("Resume generation complete.")
    message.exec()
    return record
def measureConvergence(y0, nYears, saveDir):
    """Call computeConvergenceOverYearRange for sentences in the given
    year range and save convergence results to the given directory.

    y0      -- first year of the range (inclusive).
    nYears  -- number of years after y0 to include (range end is y0 + nYears).
    saveDir -- directory where the pickled results are written.
    """
    batchSize = 1e6
    # NOTE(review): 10e7 == 1e8 -- confirm this wasn't meant to be 1e7.
    maxSentences = 10e7
    yN = y0 + nYears
    allYears = getYears()
    # Restrict to the years actually available in the corpus.
    yearRange = allYears[(y0 <= allYears) & (allYears <= yN)]
    convergence, sentenceYearCounter, vocabSize = computeConvergenceOverYearRange(
        yearRange, batchSize, maxSentences)
    checkPath(saveDir)
    # BUGFIX: join with os.path.join so a missing trailing '/' on saveDir
    # does not mangle the filename, and close the handle via `with`
    # (the original leaked the open file object).
    fname = os.path.join(saveDir, 'convergenceRange_%d-%d.pkl' % (y0, yN))
    with open(fname, 'wb') as f:
        pkl.dump((convergence, sentenceYearCounter, vocabSize), f)
def getMoods(self, qqnumber: str) -> None:
    # Download this user's mood (Qzone feed) pages and save them under
    # mood_result/<qqnumber>/, one raw-response file per 20-entry page.
    # Also records whether the profile was accessible in self.access and
    # flushes that map to Access.json when done.
    referer = 'http://user.qzone.qq.com/' + qqnumber
    self.headers['Referer'] = referer
    # Create the per-QQ-number folder used to store the downloaded pages
    util.checkPath('mood_result/' + qqnumber)
    # Build the base URL for this user's mood list
    urlBase = util.parseMoodsUrl(qqnumber)
    pos = 0
    key = True
    while key:
        print("\tDealing with position:\t%d" % pos)
        url = urlBase + "&pos=%d" % pos
        # print(url)  # for debugging
        res = self.session.get(url, headers = self.headers)
        con = res.text
        # Save the raw response, named after the page offset
        with open('mood_result/' + qqnumber + '/' + str(pos), 'w', encoding="utf-8") as f:
            f.write(con)
        # An empty message list marks the last page -- stop paging.
        if '''"msglist":null''' in con:
            key = False
        # Access forbidden: the profile owner made their space private.
        # (The literal below is the server's Chinese error message;
        # it must stay byte-identical to match the response.)
        if '''"对不起,主人设置了保密,您没有权限查看"''' in con:
            with open('naida_log.log', 'a', encoding="utf-8") as logFile:
                logFile.write("%s No access..\n" % qqnumber)
            self.access[qqnumber] = 'False'
            key = False
        else:
            self.access[qqnumber] = 'True'
        # Cookie expired -- log it and abort the whole run.
        if '''"subcode":-4001''' in con:
            with open('naida_log.log', 'a', encoding="utf-8") as log_file:
                log_file.write('Cookie expired! Time is %s\n' % time.ctime())
            sys.exit()
        pos += 20
        # Throttle requests to avoid tripping rate limits
        time.sleep(5)
    # Persist the accessibility map for later runs
    with open('Access.json', 'w', encoding = 'utf-8') as fp:
        fp.write((json.dumps(self.access, ensure_ascii = False, indent = 4)))
def getMoodsStart(self):
    """Read the QQ-number list and fetch each account's moods, running
    at most 10 worker threads concurrently."""
    app = Moods()
    with open('qqnumber.inc', encoding="utf-8") as qnumberFile:
        qnumberString = qnumberFile.read()
    # NOTE(review): eval() on file contents is unsafe if the file can be
    # tampered with -- a literal parser would be preferable.
    qnumberList = eval(qnumberString)

    # Make sure the result folder exists before workers start writing.
    util.checkPath('mood_result')
    self.sem = threading.Semaphore(10)

    while qnumberList:
        # Snapshot taken before the pop, so it still contains `number`.
        saveBackNumber = qnumberList[:]
        number = qnumberList.pop()
        self.sem.acquire()
        worker = threading.Thread(target=self.getMooding,
                                  args=(number, app, saveBackNumber))
        worker.start()
        time.sleep(1)
    print("Finish All!")
def generateModels(y0, yN, yearsInModel, stepYears, modelFolder): """Generate time shifting w2v models on the given time range (y0 - yN). Each model contains the specified number of years (yearsInModel). The start year of each new model is set to be stepYears after the previous model. Resulting models are saved on modelFolder. """ checkPath(modelFolder) for year in range(y0, yN - yearsInModel + 1, stepYears): startY = year endY = year + yearsInModel modelName = modelFolder + '/%d_%d.w2v' % (year, year + yearsInModel) vocabName = modelName.replace('.w2v', '.vocab.w2v') print 'Building model: ', modelName sentences = getSentencesInRange(startY, endY) model = gensim.models.Word2Vec(min_count=1) model.build_vocab(sentences) model.train(sentences) print '...saving' model.init_sims(replace=True) model.wv.save_word2vec_format(modelName, fvocab=vocabName, binary=True)
def __init__(self):
    """Prepare request headers, the friends-list base URL, and the
    local 'friends' output folder."""
    self.headers = util.headers
    self.baseUrl = util.parseFriendsUrl()
    # Create the output directory if it is missing
    util.checkPath('friends')
    print('Start get friends file')
def getAccess(self):
    """Load and return the access map from Access.json.

    Returns the dict written by getMoods (QQ number -> 'True'/'False').
    """
    util.checkPath('Access.json')
    # BUGFIX: parse with json.load instead of eval(). The file is written
    # by json.dumps, so eval() both executes arbitrary code from the file
    # and would fail on JSON literals such as true/false/null.
    with open('Access.json', 'r', encoding='utf-8') as f:
        return json.load(f)