def create_schema(data_directory, raw_data_location, threads=1, force=False):
    project = HypergolProject(dataDirectory=data_directory, force=force)
    rawMetadata = project.datasetFactory.get(dataType=RawMetadata, name='raw_metadata', chunkCount=256)
    rawData = project.datasetFactory.get(dataType=RawData, name='raw_data', chunkCount=256)

    createRawMetadata = CreateRawMetadata(
        rawDataLocation=raw_data_location,
        splits=threads,
        outputDataset=rawMetadata
    )
    createRawData = CreateRawData(
        rawDataLocation=raw_data_location,
        plurals=PLURALS,
        inputDatasets=[rawMetadata],
        outputDataset=rawData,
        debug=True
    )

    pipeline = Pipeline(tasks=[
        createRawMetadata,
        createRawData,
    ])
    pipeline.run(threads=threads)
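# The pipeline functions in these scripts are plain callables; a minimal sketch
# of how one can be exposed on the command line via python-fire. Whether the
# generated scripts already include such an entry point is not shown here, so
# treat the import, the entry point, and the flag values as assumptions.
import fire

if __name__ == '__main__':
    fire.Fire(create_schema)

# example invocation (paths are illustrative):
#   python create_schema.py --data_directory=tempdata --raw_data_location=data/raw --threads=4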
def process_blogposts(threads=1, force=False, onlyTasks=None):
    project = HypergolProject(dataDirectory='.', force=force)
    articles = project.datasetFactory.get(dataType=Article, name='articles')
    articleTexts = project.datasetFactory.get(dataType=ArticleText, name='article_texts')
    articlePages = project.datasetFactory.get(dataType=ArticlePage, name='article_pages')
    sentences = project.datasetFactory.get(dataType=Sentence, name='sentences')

    # The exampleInput*/exampleOutput* arguments below are generated placeholders;
    # the filled-in version that wires up the datasets defined above follows.
    loadHtmlPagesTask = LoadHtmlPagesTask(
        inputDatasets=[exampleInputDataset1, exampleInputDataset2],
        outputDataset=exampleOutputDataset,
    )
    createArticleTextsTask = CreateArticleTextsTask(
        inputDatasets=[exampleInputDataset1, exampleInputDataset2],
        outputDataset=exampleOutputDataset,
    )
    createArticlesTask = CreateArticlesTask(
        inputDatasets=[exampleInputDataset1, exampleInputDataset2],
        outputDataset=exampleOutputDataset,
    )
    createSentencesTask = CreateSentencesTask(
        inputDatasets=[exampleInputDataset1, exampleInputDataset2],
        outputDataset=exampleOutputDataset,
    )

    pipeline = Pipeline(tasks=[
        loadHtmlPagesTask,
        createArticleTextsTask,
        createArticlesTask,
        createSentencesTask,
    ])
    pipeline.run(threads=threads, onlyTasks=onlyTasks)
def process_blogposts(threads=1, force=False, onlyTasks=None):
    project = HypergolProject(dataDirectory=f'{os.environ["BASE_DIR"]}/tempdata', force=force)
    SOURCE_PATTERN = f'{os.environ["BASE_DIR"]}/data/blogposts/pages_*.pkl'
    articles = project.datasetFactory.get(dataType=Article, name='articles')
    articleTexts = project.datasetFactory.get(dataType=ArticleText, name='article_texts')
    articlePages = project.datasetFactory.get(dataType=ArticlePage, name='article_pages')
    sentences = project.datasetFactory.get(dataType=Sentence, name='sentences')

    loadHtmlPagesTask = LoadHtmlPagesTask(
        outputDataset=articlePages,
        sourcePattern=SOURCE_PATTERN
    )
    createArticleTextsTask = CreateArticleTextsTask(
        inputDatasets=[articlePages],
        outputDataset=articleTexts,
    )
    createArticlesTask = CreateArticlesTask(
        inputDatasets=[articleTexts],
        outputDataset=articles,
        spacyModelName='en_core_web_sm',
        threads=2
    )
    createSentencesTask = CreateSentencesTask(
        inputDatasets=[articles],
        outputDataset=sentences,
    )

    pipeline = Pipeline(tasks=[
        loadHtmlPagesTask,
        createArticleTextsTask,
        createArticlesTask,
        createSentencesTask,
    ])
    pipeline.run(threads=threads, onlyTasks=onlyTasks)
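# The task classes above are referenced but not shown; a rough sketch of what
# CreateSentencesTask could look like, assuming the Hypergol task interface
# (subclass Task, implement run() over the input datasets' elements, emit
# results with self.output.append()). The module paths and the Article/Sentence
# field names are assumptions for illustration only.
from hypergol import Task

from data_models.article import Article
from data_models.sentence import Sentence


class CreateSentencesTask(Task):

    def run(self, article: Article):
        # naive sentence split; a real implementation would reuse the spaCy parse
        for sentenceId, sentenceText in enumerate(article.text.split('.')):
            if sentenceText.strip():
                self.output.append(Sentence(
                    articleId=article.articleId,
                    sentenceId=sentenceId,
                    text=sentenceText.strip()
                ))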
def create_embedding_model(sourceDataDirectory, modelDirectory, loadModelFile=None, threads=1, force=False):
    slack_message(message='Processing start')
    logger = Logger()
    project = HypergolProject(dataDirectory=sourceDataDirectory, force=force)
    documents = project.datasetFactory.get(dataType=Document, branch='document_creation', name='documents', chunkCount=256)

    logger.info('Loading dataset - START')
    taggedData = []
    with documents.open('r') as dsr:
        for document in tqdm(dsr, total=21_000_000):
        # for document in tqdm(islice(dsr, 100_000), total=100_000):
            taggedData.append(TaggedDocument(words=document.tokens, tags=document.labels))
    logger.info('Loading dataset - END')

    modelName = f'doc2vec_{date.today().strftime("%Y%m%d")}_{project.repoManager.commitHash}'
    if loadModelFile is None:
        logger.info('Model construction - START')
        model = Doc2Vec(
            dm=0, dbow_words=1, dm_concat=0,
            vector_size=VECTOR_SIZE, window=5,
            negative=20, hs=0, min_count=3,
            workers=31, epochs=50,
            alpha=0.025, min_alpha=0.001,
            callbacks=[EpochSaver(modelDirectory=modelDirectory, modelName=modelName)]
        )
        model.build_vocab(taggedData)
        logger.info('Model construction - END')
    else:
        logger.info('Model loading - START')
        model = Doc2Vec.load(loadModelFile)
        model.callbacks = [EpochSaver(
            modelDirectory=modelDirectory,
            modelName=modelName,
            epoch=model.callbacks[0].epoch + 1
        )]
        logger.info('Model loading - END')

    slack_message(message='Training start')
    logger.info('Model training - START')
    model.train(documents=taggedData, total_examples=model.corpus_count, epochs=model.epochs)
    logger.info('Model training - END')
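# EpochSaver is used above but not shown; a minimal sketch of what it could be,
# assuming it is a gensim CallbackAny2Vec that checkpoints the model after each
# epoch so training can be resumed via loadModelFile. The constructor arguments
# mirror the calls above; the on-disk naming scheme is an assumption.
from gensim.models.callbacks import CallbackAny2Vec


class EpochSaver(CallbackAny2Vec):

    def __init__(self, modelDirectory, modelName, epoch=0):
        self.modelDirectory = modelDirectory
        self.modelName = modelName
        self.epoch = epoch

    def on_epoch_end(self, model):
        # one checkpoint per epoch; the epoch counter is what the loading branch
        # above reads back as model.callbacks[0].epoch
        model.save(f'{self.modelDirectory}/{self.modelName}_epoch_{self.epoch}.model')
        self.epoch += 1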
def process_hacker_news(filePattern, dataDirectory, threads=1, force=False):
    project = HypergolProject(dataDirectory=dataDirectory, force=force)
    rawData = project.datasetFactory.get(dataType=RawData, name='raw_data', chunkCount=256)
    comments = project.datasetFactory.get(dataType=Comment, name='comments', chunkCount=256)
    stories = project.datasetFactory.get(dataType=Story, name='stories', chunkCount=256)
    documents = project.datasetFactory.get(dataType=Document, name='documents', chunkCount=256)

    loadData = LoadData(
        logAtEachN=200_000,
        filePattern=filePattern,
        outputDataset=rawData
    )
    selectStories = SelectStories(
        inputDatasets=[rawData],
        outputDataset=stories,
    )
    selectComments = SelectComments(
        inputDatasets=[rawData],
        outputDataset=comments,
    )
    processWithSpacy = ProcessWithSpacy(
        logAtEachN=10_000,
        spacyModelName='en_core_web_sm',
        inputDatasets=[comments],
        outputDataset=documents,
    )

    pipeline = Pipeline(tasks=[
        loadData,
        selectStories,
        selectComments,
        processWithSpacy,
    ])
    pipeline.run(threads=threads)
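# ProcessWithSpacy is the only task above with per-element NLP work; a rough
# sketch of what it could look like, again assuming the Hypergol Task interface
# and lazily loading the spaCy model inside the worker. The Comment/Document
# field names, the module paths, and the constructor forwarding are assumptions.
import spacy

from hypergol import Task

from data_models.comment import Comment
from data_models.document import Document


class ProcessWithSpacy(Task):

    def __init__(self, spacyModelName, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.spacyModelName = spacyModelName
        self.nlp = None

    def run(self, comment: Comment):
        if self.nlp is None:
            # loaded on first use so the task object stays cheap to send to workers
            self.nlp = spacy.load(self.spacyModelName)
        parsed = self.nlp(comment.text)
        self.output.append(Document(
            documentId=comment.commentId,
            tokens=[token.text for token in parsed],
            labels=[token.pos_ for token in parsed]
        ))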
def train_my_test_model(force=False):
    project = HypergolProject(dataDirectory=f'{os.environ["BASE_DIR"]}/tempdata', force=force)
    VOCABULARY_PATH = f'{os.environ["BASE_DIR"]}/tempdata/vocabulary.json'
    POS_VOCABULARY_PATH = f'{os.environ["BASE_DIR"]}/tempdata/pos_vocabulary.json'

    batchProcessor = MyTestModelBatchProcessor(
        inputDataset=project.datasetFactory.get(dataType=Sentence, name='sentences'),
        inputBatchSize=16,
        maxTokenCount=100,
        outputDataset=project.datasetFactory.get(dataType=EvaluationOutput, name='outputs')
    )

    embeddingDimension = 256
    with open(VOCABULARY_PATH, 'r') as vocabularyFile:
        vocabulary = json.load(vocabularyFile)
    with open(POS_VOCABULARY_PATH, 'r') as posVocabularyFile:
        posVocabulary = json.load(posVocabularyFile)

    myTestModel = MyTestModel(
        modelName=MyTestModel.__name__,
        longName=f'{MyTestModel.__name__}_{date.today().strftime("%Y%m%d")}_{project.repoManager.commitHash}',
        inputDatasetChkFileChecksum=f'{batchProcessor.inputDataset.chkFile.get_checksum()}',
        embeddingBlock=EmbeddingBlock(
            vocabulary=vocabulary,
            embeddingDimension=embeddingDimension
        ),
        lstmBlock=LstmBlock(
            embeddingDimension=embeddingDimension,
            layerCount=2,
            posTypeCount=len(posVocabulary),
            dropoutRate=0.1
        ),
        outputBlock=OutputBlock(posTypes=posVocabulary)
    )
    modelManager = TensorflowModelManager(
        model=myTestModel,
        optimizer=tf.keras.optimizers.Adam(lr=1),
        batchProcessor=batchProcessor,
        project=project,
        restoreWeightsPath=None
    )
    modelManager.run(
        stepCount=100,
        evaluationSteps=list(range(0, 100, 10)),
        tracingSteps=list(range(0, 100, 5))
    )
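# The two vocabulary JSON files are loaded above but their creation is not
# shown; a minimal sketch of how they could be produced from the sentences
# dataset, assuming Sentence objects carry tokens and posTags lists (the field
# names, the helper's signature, and the output format are assumptions).
import json

from hypergol import HypergolProject

from data_models.sentence import Sentence


def create_vocabularies(dataDirectory, vocabularyPath, posVocabularyPath):
    project = HypergolProject(dataDirectory=dataDirectory)
    sentences = project.datasetFactory.get(dataType=Sentence, name='sentences')
    tokens = set()
    posTags = set()
    with sentences.open('r') as dsr:
        for sentence in dsr:
            tokens.update(sentence.tokens)
            posTags.update(sentence.posTags)
    with open(vocabularyPath, 'w') as vocabularyFile:
        json.dump(sorted(tokens), vocabularyFile)
    with open(posVocabularyPath, 'w') as posVocabularyFile:
        json.dump(sorted(posTags), posVocabularyFile)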
def train_my_test_model(force=False):
    project = HypergolProject(dataDirectory='.', force=force)
    batchProcessor = MyTestModelBatchProcessor(
        inputDataset=project.datasetFactory.get(dataType=Sentence, name='inputs'),
        inputBatchSize=16,
        outputDataset=project.datasetFactory.get(dataType=EvaluationOutput, name='outputs'),
        exampleArgument=''
    )
    myTestModel = MyTestModel(
        modelName=MyTestModel.__name__,
        longName=f'{MyTestModel.__name__}_{date.today().strftime("%Y%m%d")}_{project.repoManager.commitHash}',
        inputDatasetChkFileChecksum=f'{batchProcessor.inputDataset.chkFile.get_checksum()}',
        embeddingBlock=EmbeddingBlock(
            blockArgument1='',
            blockArgument2='',
        ),
        lstmBlock=LstmBlock(
            blockArgument1='',
            blockArgument2='',
        ),
        outputBlock=OutputBlock(
            blockArgument1='',
            blockArgument2='',
        ),
    )
    modelManager = TensorflowModelManager(
        model=myTestModel,
        optimizer=tf.keras.optimizers.Adam(lr=1),
        batchProcessor=batchProcessor,
        project=project,
        restoreWeightsPath=None
    )
    modelManager.run(
        stepCount=100,
        evaluationSteps=list(range(0, 100, 10)),
        tracingSteps=list(range(0, 100, 5))
    )
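# The block arguments in the skeleton above are generated placeholders; a rough
# sketch of what a filled-in EmbeddingBlock could look like, written here as a
# plain Keras layer because Hypergol's own block base class is not shown in
# these snippets. The argument names follow the filled-in training script above;
# everything else is an assumption for illustration.
import tensorflow as tf


class EmbeddingBlock(tf.keras.layers.Layer):

    def __init__(self, vocabulary, embeddingDimension, **kwargs):
        super().__init__(**kwargs)
        self.tokenToId = {token: tokenId for tokenId, token in enumerate(vocabulary)}
        self.embedding = tf.keras.layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embeddingDimension
        )

    def call(self, tokenIds):
        # tokenIds: int32 tensor of shape (batchSize, maxTokenCount)
        return self.embedding(tokenIds)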