Example #1
def create_schema(data_directory, raw_data_location, threads=1, force=False):
    project = HypergolProject(dataDirectory=data_directory, force=force)

    rawMetadata = project.datasetFactory.get(dataType=RawMetadata,
                                             name='raw_metadata',
                                             chunkCount=256)
    rawData = project.datasetFactory.get(dataType=RawData,
                                         name='raw_data',
                                         chunkCount=256)

    createRawMetadata = CreateRawMetadata(rawDataLocation=raw_data_location,
                                          splits=threads,
                                          outputDataset=rawMetadata)

    createRawData = CreateRawData(rawDataLocation=raw_data_location,
                                  plurals=PLURALS,
                                  inputDatasets=[rawMetadata],
                                  outputDataset=rawData,
                                  debug=True)

    pipeline = Pipeline(tasks=[
        createRawMetadata,
        createRawData,
    ])
    pipeline.run(threads=threads)
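
A pipeline function like this is a plain Python callable, so the enclosing script usually exposes it on the command line. A minimal sketch, assuming the python-fire package handles argument parsing (the file name in the comment is illustrative):

import fire

if __name__ == '__main__':
    # Turns the function's parameters into CLI arguments, e.g.:
    #   python create_schema.py DATA_DIR RAW_LOCATION --threads=4 --force=True
    fire.Fire(create_schema)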
Example #2
def process_blogposts(threads=1, force=False, onlyTasks=None):
    project = HypergolProject(dataDirectory='.', force=force)
    articles = project.datasetFactory.get(dataType=Article, name='articles')
    articleTexts = project.datasetFactory.get(dataType=ArticleText,
                                              name='article_texts')
    articlePages = project.datasetFactory.get(dataType=ArticlePage,
                                              name='article_pages')
    sentences = project.datasetFactory.get(dataType=Sentence, name='sentences')
    # NOTE: the example* names below are unresolved skeleton placeholders;
    # replace them with the datasets created above (Example #3 shows the
    # wired-up version of this pipeline).
    loadHtmlPagesTask = LoadHtmlPagesTask(
        inputDatasets=[exampleInputDataset1, exampleInputDataset2],
        outputDataset=exampleOutputDataset,
    )
    createArticleTextsTask = CreateArticleTextsTask(
        inputDatasets=[exampleInputDataset1, exampleInputDataset2],
        outputDataset=exampleOutputDataset,
    )
    createArticlesTask = CreateArticlesTask(
        inputDatasets=[exampleInputDataset1, exampleInputDataset2],
        outputDataset=exampleOutputDataset,
    )
    createSentencesTask = CreateSentencesTask(
        inputDatasets=[exampleInputDataset1, exampleInputDataset2],
        outputDataset=exampleOutputDataset,
    )

    pipeline = Pipeline(tasks=[
        loadHtmlPagesTask,
        createArticleTextsTask,
        createArticlesTask,
        createSentencesTask,
    ])
    pipeline.run(threads=threads, onlyTasks=onlyTasks)
Example #3
def process_blogposts(threads=1, force=False, onlyTasks=None):
    project = HypergolProject(
        dataDirectory=f'{os.environ["BASE_DIR"]}/tempdata', force=force)
    SOURCE_PATTERN = f'{os.environ["BASE_DIR"]}/data/blogposts/pages_*.pkl'
    articles = project.datasetFactory.get(dataType=Article, name='articles')
    articleTexts = project.datasetFactory.get(dataType=ArticleText,
                                              name='article_texts')
    articlePages = project.datasetFactory.get(dataType=ArticlePage,
                                              name='article_pages')
    sentences = project.datasetFactory.get(dataType=Sentence, name='sentences')
    loadHtmlPagesTask = LoadHtmlPagesTask(outputDataset=articlePages,
                                          sourcePattern=SOURCE_PATTERN)

    createArticleTextsTask = CreateArticleTextsTask(
        inputDatasets=[articlePages],
        outputDataset=articleTexts,
    )

    createArticlesTask = CreateArticlesTask(inputDatasets=[articleTexts],
                                            outputDataset=articles,
                                            spacyModelName='en_core_web_sm',
                                            threads=2)

    createSentencesTask = CreateSentencesTask(
        inputDatasets=[articles],
        outputDataset=sentences,
    )

    pipeline = Pipeline(tasks=[
        loadHtmlPagesTask,
        createArticleTextsTask,
        createArticlesTask,
        createSentencesTask,
    ])
    pipeline.run(threads=threads, onlyTasks=onlyTasks)
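
None of the task classes are defined in these listings. As a rough sketch of what one might look like, assuming Hypergol's Task interface (subclass Task, implement run() once per input object, append results to self.output) and a purely illustrative Sentence schema:

from hypergol import Task

class CreateSentencesTask(Task):

    def run(self, article):
        # Receives one Article per call; the naive sentence split below is
        # illustrative only, as is the Sentence(url=..., text=...) schema.
        for sentenceText in article.text.split('. '):
            self.output.append(Sentence(url=article.url, text=sentenceText))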
Example #4
def create_embedding_model(sourceDataDirectory,
                           modelDirectory,
                           loadModelFile=None,
                           threads=1,
                           force=False):
    slack_message(message='Processing start')
    logger = Logger()
    project = HypergolProject(dataDirectory=sourceDataDirectory, force=force)
    documents = project.datasetFactory.get(dataType=Document,
                                           branch='document_creation',
                                           name='documents',
                                           chunkCount=256)

    logger.info('Loading dataset - START')
    taggedData = []
    with documents.open('r') as dsr:
        # Swap in the islice variant to test on a 100k-document sample:
        # for document in tqdm(islice(dsr, 100_000), total=100_000):
        for document in tqdm(dsr, total=21_000_000):
            taggedData.append(
                TaggedDocument(words=document.tokens, tags=document.labels))
    logger.info('Loading dataset - END')

    modelName = f'doc2vec_{date.today().strftime("%Y%m%d")}_{project.repoManager.commitHash}'
    if loadModelFile is None:
        logger.info('Model construction - START')
        model = Doc2Vec(dm=0,
                        dbow_words=1,
                        dm_concat=0,
                        vector_size=VECTOR_SIZE,
                        window=5,
                        negative=20,
                        hs=0,
                        min_count=3,
                        workers=31,
                        epochs=50,
                        alpha=0.025,
                        min_alpha=0.001,
                        callbacks=[
                            EpochSaver(modelDirectory=modelDirectory,
                                       modelName=modelName)
                        ])
        model.build_vocab(taggedData)
        logger.info('Model construction - END')
    else:
        logger.info('Model loading - START')
        model = Doc2Vec.load(loadModelFile)
        model.callbacks = [
            EpochSaver(modelDirectory=modelDirectory,
                       modelName=modelName,
                       epoch=model.callbacks[0].epoch + 1)
        ]
        logger.info('Model loading - END')

    slack_message(message='Training start')
    logger.info('Model training - START')
    model.train(documents=taggedData,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    logger.info('Model training - END')
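
EpochSaver is also not defined here; with gensim it would be built on CallbackAny2Vec. A minimal sketch matching the constructor arguments used above (the checkpoint path format is an assumption):

from gensim.models.callbacks import CallbackAny2Vec

class EpochSaver(CallbackAny2Vec):
    """Checkpoints the Doc2Vec model to disk after every training epoch."""

    def __init__(self, modelDirectory, modelName, epoch=0):
        self.modelDirectory = modelDirectory
        self.modelName = modelName
        self.epoch = epoch

    def on_epoch_end(self, model):
        model.save(f'{self.modelDirectory}/{self.modelName}_epoch{self.epoch}')
        self.epoch += 1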
Example #5
def process_hacker_news(filePattern, dataDirectory, threads=1, force=False):
    project = HypergolProject(dataDirectory=dataDirectory, force=force)
    rawData = project.datasetFactory.get(dataType=RawData,
                                         name='raw_data',
                                         chunkCount=256)
    comments = project.datasetFactory.get(dataType=Comment,
                                          name='comments',
                                          chunkCount=256)
    stories = project.datasetFactory.get(dataType=Story,
                                         name='stories',
                                         chunkCount=256)
    documents = project.datasetFactory.get(dataType=Document,
                                           name='documents',
                                           chunkCount=256)

    loadData = LoadData(logAtEachN=200_000,
                        filePattern=filePattern,
                        outputDataset=rawData)

    selectStories = SelectStories(
        inputDatasets=[rawData],
        outputDataset=stories,
    )
    selectComments = SelectComments(
        inputDatasets=[rawData],
        outputDataset=comments,
    )

    processWithSpacy = ProcessWithSpacy(
        logAtEachN=10_000,
        spacyModelName='en_core_web_sm',
        inputDatasets=[comments],
        outputDataset=documents,
    )

    # Every stage is commented out below; uncomment the tasks this run
    # should execute (they can be toggled independently across re-runs).
    pipeline = Pipeline(tasks=[
        # loadData,
        # selectStories,
        # selectComments,
        # processWithSpacy,
    ])
    pipeline.run(threads=threads)
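
After a run with processWithSpacy enabled, the documents dataset can be spot-checked with the same reader pattern used in create_embedding_model above:

# Inspect the first Document written by the pipeline.
with documents.open('r') as datasetReader:
    for document in datasetReader:
        print(document.tokens[:10], document.labels)
        break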
Example #6
def train_my_test_model(force=False):
    project = HypergolProject(
        dataDirectory=f'{os.environ["BASE_DIR"]}/tempdata', force=force)
    VOCABULARY_PATH = f'{os.environ["BASE_DIR"]}/tempdata/vocabulary.json'
    POS_VOCABULARY_PATH = f'{os.environ["BASE_DIR"]}/tempdata/pos_vocabulary.json'

    batchProcessor = MyTestModelBatchProcessor(
        inputDataset=project.datasetFactory.get(dataType=Sentence,
                                                name='sentences'),
        inputBatchSize=16,
        maxTokenCount=100,
        outputDataset=project.datasetFactory.get(dataType=EvaluationOutput,
                                                 name='outputs'))
    embeddingDimension = 256
    with open(VOCABULARY_PATH, 'r') as vocabularyFile:
        vocabulary = json.load(vocabularyFile)
    with open(POS_VOCABULARY_PATH, 'r') as posVocabularyFile:
        posVocabulary = json.load(posVocabularyFile)
    myTestModel = MyTestModel(
        modelName=MyTestModel.__name__,
        longName=f'{MyTestModel.__name__}_{date.today().strftime("%Y%m%d")}_{project.repoManager.commitHash}',
        inputDatasetChkFileChecksum=f'{batchProcessor.inputDataset.chkFile.get_checksum()}',
        embeddingBlock=EmbeddingBlock(vocabulary=vocabulary,
                                      embeddingDimension=embeddingDimension),
        lstmBlock=LstmBlock(embeddingDimension=embeddingDimension,
                            layerCount=2,
                            posTypeCount=len(posVocabulary),
                            dropoutRate=0.1),
        outputBlock=OutputBlock(posTypes=posVocabulary))
    modelManager = TensorflowModelManager(
        model=myTestModel,
        optimizer=tf.keras.optimizers.Adam(learning_rate=1),
        batchProcessor=batchProcessor,
        project=project,
        restoreWeightsPath=None)
    modelManager.run(stepCount=100,
                     evaluationSteps=list(range(0, 100, 10)),
                     tracingSteps=list(range(0, 100, 5)))
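
Note that stepCount, evaluationSteps, and tracingSteps are a plain integer and lists of step indices, so any Python-expressible schedule should work; presumably the manager matches the current step against these lists. For example:

evaluationSteps = list(range(0, 100, 10))  # [0, 10, 20, ..., 90]
tracingSteps = list(range(0, 100, 5))      # [0, 5, 10, ..., 95]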
Example #7
def train_my_test_model(force=False):
    project = HypergolProject(dataDirectory='.', force=force)

    batchProcessor = MyTestModelBatchProcessor(
        inputDataset=project.datasetFactory.get(dataType=Sentence, name='inputs'),
        inputBatchSize=16,
        outputDataset=project.datasetFactory.get(dataType=EvaluationOutput, name='outputs'),
        exampleArgument=''
    )
    # NOTE: exampleArgument and the blockArgument* values above and below are
    # unresolved skeleton placeholders; fill them in with real hyperparameters
    # (Example #6 shows a filled-in version of this script).
    myTestModel = MyTestModel(
        modelName=MyTestModel.__name__,
        longName=f'{MyTestModel.__name__}_{date.today().strftime("%Y%m%d")}_{project.repoManager.commitHash}',
        inputDatasetChkFileChecksum=f'{batchProcessor.inputDataset.chkFile.get_checksum()}',
        embeddingBlock=EmbeddingBlock(
            blockArgument1='',
            blockArgument2='',
        ),
        lstmBlock=LstmBlock(
            blockArgument1='',
            blockArgument2='',
        ),
        outputBlock=OutputBlock(
            blockArgument1='',
            blockArgument2='',
        ),
    )
    modelManager = TensorflowModelManager(
        model=myTestModel,
        optimizer=tf.keras.optimizers.Adam(learning_rate=1),
        batchProcessor=batchProcessor,
        project=project,
        restoreWeightsPath=None
    )
    modelManager.run(
        stepCount=100,
        evaluationSteps=list(range(0, 100, 10)),
        tracingSteps=list(range(0, 100, 5))
    )