Code example #1
def run_featurize_patents(spark: SparkSession):
    logger.info("Starting execution")
    full = read(spark=spark,
                storage_name=FILTERED_STORAGE_NAME,
                containter_name=FILTERED_CONTAINER_NAME,
                output_folder=FILTERED_OUTPUT_FOLDER,
                logger=logger)
    full = process_full(full)

    text = read(spark=spark,
                storage_name=PROCESSED_TEXT_STORAGE_NAME,
                containter_name=PROCESSED_TEXT_CONTAINER_NAME,
                output_folder=PROCESSED_TEXT_OUTPUT_FOLDER,
                logger=logger)
    text = process_text(text)
    result = full.join(text, ["_file"], "inner")

    save(spark=spark,
         df=result,
         num_files=NUM_OUTPUT_FILES,
         containter_name=FEATURES_CONTAINER_NAME,
         storage_name=FEATURES_STORAGE_NAME,
         output_folder=FEATURES_OUTPUT_FOLDER,
         logger=logger)
    logger.info("Process finished!")
    return result
Code example #2
def run_parquetizer(spark: SparkSession):
    logger.info("Starting execution")
    df = read(spark)
    result = process(df)
    save(spark=spark, df=result, num_files=NUM_OUTPUT_FILES, containter_name=PARQUET_CONTAINER_NAME,
         storage_name=PARQUET_STORAGE_NAME, output_folder=PARQUET_OUTPUT_FOLDER, logger=logger)
    logger.info("Process finished!")
    return result
Code example #3
def process(input_folder: str, output_file: str):
    with open(output_file, "w", encoding="utf-8") as f:
        for (root, directories, file_names) in os.walk(input_folder):
            for file in file_names:
                if file.endswith(".xml"):  # To exclude xds and other files
                    with open(os.path.join(root, file), "r",
                              encoding="utf-8") as input_file:
                        data = input_file.read()
                    output = sanitize_xml(data)
                    f.write(output)
    logger.info("Process completed!")
Code example #4
def log_language_distribution(df: DataFrame, field_name: str):
    """Generates a log with the distribution of languages"""
    logger.info(f"Getting language distribution for: {field_name}")
    if isinstance(df.select(field_name).schema.fields[0].dataType, ArrayType):
        languages = df.select(
            sf.explode_outer(field_name).alias("target_field"))
    else:
        languages = df.select(sf.col(field_name).alias("target_field"))
    languages_p = languages.groupby("target_field._lang").count().toPandas()
    logger.info(
        f"Distribution of languages in {field_name}:\n{languages_p.to_string()}"
    )
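The `_lang` field read above is the attribute column that spark-xml produces for `lang="..."` XML attributes. A minimal, hypothetical usage sketch of the helper; the data and column names below are illustrative only and assume `log_language_distribution` is in scope:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
# Nested Rows stand in for the structs that spark-xml would produce for a language-tagged element
df = spark.createDataFrame([
    Row(_file="a.xml", title=Row(_lang="en", _VALUE="Solar cell")),
    Row(_file="b.xml", title=Row(_lang="de", _VALUE="Solarzelle")),
])
log_language_distribution(df, "title")  # logs one count per _lang value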
Code example #5
def run_frequent_words(spark: SparkSession):
    logger.info("Starting execution")
    df = read(spark=spark,
              storage_name=PROCESSED_TEXT_STORAGE_NAME,
              containter_name=PROCESSED_TEXT_CONTAINER_NAME,
              output_folder=PROCESSED_TEXT_OUTPUT_FOLDER,
              logger=logger)

    result_p = process(df)

    save(result_p)
    logger.info("Process finished!")
    return result_p
Code example #6
def save_ml_model(spark, model, storage_name: str, container_name: str,
                  output_folder, output_suffix):
    """Saves a spark model to a blob storage"""
    key = spark.conf.get(
        f"spark.hadoop.fs.azure.account.key.{storage_name}.blob.core.windows.net"
    )
    create_if_not_exists_container(storage_name,
                                   container_name=container_name,
                                   key=key,
                                   logger=logger)
    output_path = f"wasbs://{container_name}@{storage_name}.blob.core.windows.net/{output_folder}_{output_suffix}/"
    model.write().overwrite().save(output_path)
    logger.info(f"Model saved in: {output_path}")
Code example #7
def run_energy_classifier(spark: SparkSession):
    logger.info("Starting execution")
    df = read(spark=spark,
              storage_name=FEATURES_STORAGE_NAME,
              containter_name=FEATURES_CONTAINER_NAME,
              output_folder=FEATURES_OUTPUT_FOLDER,
              logger=logger)
    result = process(df)
    save(spark=spark,
         df=result,
         num_files=NUM_OUTPUT_FILES,
         containter_name=ENERGY_PATENTS_CONTAINER_NAME,
         storage_name=ENERGY_PATENTS_STORAGE_NAME,
         output_folder=ENERGY_PATENTS_OUTPUT_FOLDER,
         logger=logger)
    logger.info("Process finished!")
    return result
Code example #8
def unzip_data(target_file: str, output_folder: str) -> Tuple[str, int]:
    num_files = 0
    if target_file.endswith("tgz"):
        tar = tarfile.open(target_file, "r:gz")
        for member in tar.getmembers():
            if member.isreg():  # skip if the TarInfo is not files
                member.name = os.path.basename(
                    member.name)  # remove the path by reset it
                logger.debug(f"Extracting {member.name} into {output_folder}")
                tar.extract(member, output_folder)
                num_files += 1
            else:
                logger.info(f"Omitting file {member.name}")
    else:
        logger.error(f"Unsupported file extension: {target_file}")
    logger.info(f"Unzip completed! {num_files} files into {target_file}")
    return target_file, num_files
Code example #9
def process(df: DataFrame) -> DataFrame:
    """
    Process text columns and generates two columns per input column with the result after StopWords and Lemmatization
    """
    cols = [
        OUTPUT_COL_ENGLISH_TEXT, OUTPUT_COL_ENGLISH_ABSTRACT_TEXT,
        OUTPUT_COL_ENGLISH_TITLE_TEXT, OUTPUT_COL_ENGLISH_CLAIMS_TEXT
    ]
    df = df.select("_file", *cols)
    # Initialized only once, because loading the pretrained model downloads data each time
    lemma = LemmatizerModel.pretrained(name="lemma_antbnc",
                                       lang="en").setInputCols(
                                           ["stopwords"]).setOutputCol("lemma")
    for col in cols:
        logger.info(f"Processing column: {col}")
        df = process_col(df=df, input_col=col, lemma=lemma)
    return df
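`process_col` is not shown in these excerpts. Since the shared LemmatizerModel is configured to read a "stopwords" column, a Spark NLP pipeline of roughly this shape is implied; the sketch below is an assumption about its structure, and the output column names are hypothetical:

from pyspark.ml import Pipeline
from pyspark.sql import DataFrame
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, StopWordsCleaner


def process_col(df: DataFrame, input_col: str, lemma) -> DataFrame:
    """Hypothetical sketch: tokenize, drop stop words, then lemmatize one text column."""
    document = DocumentAssembler().setInputCol(input_col).setOutputCol("document")
    token = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    stopwords = StopWordsCleaner().setInputCols(["token"]).setOutputCol("stopwords")
    # Turn the annotations back into plain string arrays; the output names are assumptions.
    finisher = Finisher().setInputCols(["stopwords", "lemma"]).setOutputCols(
        [f"{input_col}_stopwords", f"{input_col}_lemma"])
    pipeline = Pipeline(stages=[document, token, stopwords, lemma, finisher])
    return pipeline.fit(df).transform(df)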
Code example #10
def run_text_processor(spark: SparkSession):
    logger.info("Starting execution")
    df = read(spark=spark,
              storage_name=FILTERED_STORAGE_NAME,
              containter_name=FILTERED_CONTAINER_NAME,
              output_folder=FILTERED_OUTPUT_FOLDER,
              logger=logger)
    result = process(df)
    save(spark=spark,
         df=result,
         num_files=NUM_OUTPUT_FILES,
         containter_name=PROCESSED_TEXT_CONTAINER_NAME,
         storage_name=PROCESSED_TEXT_STORAGE_NAME,
         output_folder=PROCESSED_TEXT_OUTPUT_FOLDER,
         logger=logger)
    logger.info("Process finished!")
    return result
Code example #11
def process(df: DataFrame) -> DataFrame:
    container_path = f"wasbs://{ENERGY_CLASSIFIER_CONTAINER_NAME}@{ENERGY_CLASSIFIER_STORAGE_NAME}.blob.core.windows.net"
    blob_folder = f"{container_path}/{ENERGY_CLASSIFIER_OUTPUT_FOLDER}/"
    model = PipelineModel.load(blob_folder)
    result = model.transform(df)
    result = result.cache()
    num_pos = result.filter(sf.col("prediction") == 1).count()
    num_neg = result.filter(sf.col("prediction") == 0).count()
    if num_pos == 0:  # TODO parametrize. Maybe min percentage?
        logger.warning(f"There are {num_pos} positives")
    else:
        logger.info(f"There are {num_pos} positives")
    if num_neg == 0:  # TODO parametrize. Maybe min percentage?
        logger.warning(f"There are {num_neg} negatives")
    else:
        logger.info(f"There are {num_neg} negatives")
    return result
Code example #12
def save_results_lda(df_p: pd.DataFrame, key: str, list_num_topics: List[int]):
    """Saves a csv file to blob storage with the information of the results of all topics"""
    output_file = TOPIC_CLUSTERING_OUTPUT_LDA_RESULT_PREFIX + "_".join(
        [str(n) for n in list_num_topics]) + ".csv"
    logger.info(f"Saving local data into {output_file}")
    df_p.to_csv(output_file,
                header=True,
                index=False,
                sep=";",
                encoding="utf-8")

    logger.info(f"Uploading data...")
    output_url = get_account_url(TOPIC_CLUSTERING_STORAGE_NAME)
    output_service = BlobServiceClient(account_url=output_url, credential=key)
    output_container = output_service.get_container_client(
        TOPIC_CLUSTERING_CONTAINER_NAME)
    upload_blob_client = output_container.get_blob_client(output_file)
    with open(output_file, "rb") as data:
        upload_blob_client.upload_blob(data,
                                       blob_type="BlockBlob",
                                       overwrite=True)
    logger.info("Upload completed!")
Code example #13
def save(df_p: pd.DataFrame):
    logger.info(f"Saving local data into {FREQUENT_WORDS_OUTPUT_FILE_NAME}")
    assert FREQUENT_WORDS_OUTPUT_FILE_NAME.endswith(".csv")
    df_p.to_csv(FREQUENT_WORDS_OUTPUT_FILE_NAME,
                header=True,
                index=False,
                sep=",",
                encoding="utf-8")

    logger.info(f"Uploading data...")
    output_url = get_account_url(FREQUENT_WORDS_STORAGE_NAME)
    output_service = BlobServiceClient(account_url=output_url,
                                       credential=FREQUENT_WORDS_STORAGE_KEY)
    output_container = output_service.get_container_client(
        FREQUENT_WORDS_CONTAINER_NAME)
    upload_blob_client = output_container.get_blob_client(
        FREQUENT_WORDS_OUTPUT_FILE_NAME)
    with open(FREQUENT_WORDS_OUTPUT_FILE_NAME, "rb") as data:
        upload_blob_client.upload_blob(data,
                                       blob_type="BlockBlob",
                                       overwrite=True)
    logger.info("Upload completed!")
Code example #14
def run_sanitize_data():
    logger.info("Starting execution")
    input_url = get_account_url(INPUT_STORAGE_NAME)
    service = BlobServiceClient(account_url=input_url,
                                credential=INPUT_STORAGE_KEY)
    container = service.get_container_client(INPUT_CONTAINER_NAME)
    output_url = get_account_url(SANITIZED_STORAGE_NAME)
    output_service = BlobServiceClient(account_url=output_url,
                                       credential=SANITIZED_STORAGE_KEY)
    output_container = output_service.get_container_client(
        SANITIZED_CONTAINER_NAME)
    try:
        output_container.create_container()
        logger.info(f"Created container: {SANITIZED_CONTAINER_NAME}")
    except ResourceExistsError:
        logger.warning("Output container already exists")

    blobs = list(container.list_blobs())
    logger.info(f"There are {len(blobs)} blobs to process")
    info_unzip_num_files = []
    for n, blob in enumerate(blobs):
        try:
            blob_name = blob["name"]
            logger.info(
                f"Processing blob {n + 1} of {len(blobs)}: {blob_name}")
        except KeyError:
            logger.error(f"Omitting blob, it doesn't have a name: {blob}")
            continue
        blob = container.get_blob_client(blob=blob_name)
        init_local_directories()

        # Download
        target_blob_file = os.path.join(output_tmp_folder_blob, blob_name)
        logger.info(f"Downloading {blob_name} into {target_blob_file}")
        download_data(blob=blob, target_file=target_blob_file)

        # Process
        unzip_info = unzip_data(target_file=target_blob_file,
                                output_folder=output_tmp_folder_xmls)
        info_unzip_num_files.append(unzip_info)
        output_xml_file = os.path.splitext(
            os.path.basename(blob_name))[0] + ".xml"
        process(input_folder=output_tmp_folder_xmls,
                output_file=output_xml_file)

        # Upload
        upload_blob_client = output_container.get_blob_client(output_xml_file)
        with open(output_xml_file, "rb") as data:
            upload_blob_client.upload_blob(data,
                                           blob_type="BlockBlob",
                                           overwrite=True)
        logger.info("Upload completed!")
        os.remove(output_xml_file)
        logger.info("Local file deleted!")

    logger.info("Uploaded info:")
    total_num_registers = 0
    for name, num_files in info_unzip_num_files:
        logger.info(f"File {name} -> {num_files} registers")
        total_num_registers += num_files
    logger.info(f"Total registers uploaded: {total_num_registers}")
    logger.info("Process finished!")
Code example #15
def download_data(blob, target_file: str):
    with open(target_file, "wb") as my_blob:
        blob_data = blob.download_blob()
        blob_data.readinto(my_blob)
    logger.info(f"Download completed!")
Code example #16
def read(spark: SparkSession) -> DataFrame:
    input_path = f"wasbs://{SANITIZED_CONTAINER_NAME}@{SANITIZED_STORAGE_NAME}.blob.core.windows.net/"
    logger.info(f"Reading from: {input_path}")
    df = spark.read.format("com.databricks.spark.xml").option("rowTag", "questel-patent-document").option(
        "mode", "DROPMALFORMED").load(input_path)
    return df
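The com.databricks.spark.xml data source is not bundled with Spark, so the session used by this reader needs the spark-xml package on its classpath. A minimal sketch of such a session; the package version and Scala suffix here are only an assumed example:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("parquetizer")
         # spark-xml must match the cluster's Scala version; 2.12/0.14.0 is an assumption
         .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.14.0")
         .getOrCreate())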
Code example #17
def run_energy_clustering(spark: SparkSession,
                          list_num_topics: List[int]) -> pd.DataFrame:
    logger.info("Starting execution")
    df = read(spark=spark,
              storage_name=ENERGY_PATENTS_STORAGE_NAME,
              containter_name=ENERGY_PATENTS_CONTAINER_NAME,
              output_folder=ENERGY_PATENTS_OUTPUT_FOLDER,
              logger=logger)

    min_df = 0.05
    df = df.select("_file", "prediction", "english_text_features")
    df = df.filter(sf.col("prediction") == 1)
    cv = CountVectorizer(inputCol="english_text_features",
                         outputCol="features",
                         minDF=min_df)
    cv_model = cv.fit(df)
    df_vectorized = cv_model.transform(df)
    df_vectorized.persist(StorageLevel.DISK_ONLY)
    logger.info(f"Vocabulary size: {len(cv_model.vocabulary)}")
    save_ml_model(spark=spark,
                  model=cv_model,
                  storage_name=TOPIC_CLUSTERING_STORAGE_NAME,
                  container_name=TOPIC_CLUSTERING_CONTAINER_NAME,
                  output_folder=TOPIC_CLUSTERING_OUTPUT_CV,
                  output_suffix=min_df)

    results_log_likelihood = []
    results_log_perplexity = []
    for n in list_num_topics:
        lda = LDA(k=n, maxIter=100, seed=18)
        model = lda.fit(df_vectorized)

        ll = model.logLikelihood(df_vectorized)
        lp = model.logPerplexity(df_vectorized)
        logger.info(f"Num topics: {n}")
        logger.info(
            f"The lower bound on the log likelihood of the entire corpus: {ll}"
        )
        logger.info(f"The upper bound on perplexity: {lp}")
        results_log_likelihood.append(ll)
        results_log_perplexity.append(lp)
        save_ml_model(spark=spark,
                      model=model,  # save the fitted LDA model, not the CountVectorizer again
                      storage_name=TOPIC_CLUSTERING_STORAGE_NAME,
                      container_name=TOPIC_CLUSTERING_CONTAINER_NAME,
                      output_folder=TOPIC_CLUSTERING_OUTPUT_LDA,
                      output_suffix=n)

    data = {
        "num_topics": list_num_topics,
        "log_likelihood": results_log_likelihood,
        "log_perplexity": results_log_perplexity,
    }
    result_p = pd.DataFrame(data)
    key = spark.conf.get(
        f"spark.hadoop.fs.azure.account.key.{TOPIC_CLUSTERING_STORAGE_NAME}.blob.core.windows.net"
    )
    save_results_lda(result_p, key=key, list_num_topics=list_num_topics)

    logger.info("Process finished!")
    return result_p
Code example #18
                sep=";",
                encoding="utf-8")

    logger.info(f"Uploading data...")
    output_url = get_account_url(TOPIC_CLUSTERING_STORAGE_NAME)
    output_service = BlobServiceClient(account_url=output_url, credential=key)
    output_container = output_service.get_container_client(
        TOPIC_CLUSTERING_CONTAINER_NAME)
    upload_blob_client = output_container.get_blob_client(output_file)
    with open(output_file, "rb") as data:
        upload_blob_client.upload_blob(data,
                                       blob_type="BlockBlob",
                                       overwrite=True)
    logger.info("Upload completed!")


if __name__ == '__main__':
    if len(sys.argv) > 1:
        list_num_topics = [int(arg) for arg in sys.argv[1:]]
    else:
        list_num_topics = [
            2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 30, 50, 100
        ]
    logger.info(
        f"Executing with the following list of number of topics: {list_num_topics}"
    )
    spark_session = create_spark_session("energy_clustering")
    run_energy_clustering(spark_session, list_num_topics)
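`create_spark_session` is referenced here but not shown. Given that several snippets above read the storage key from `spark.hadoop.fs.azure.account.key.<account>.blob.core.windows.net`, it presumably registers those credentials on the session; a hypothetical sketch, with the environment variable names invented for illustration:

import os
from pyspark.sql import SparkSession


def create_spark_session(app_name: str) -> SparkSession:
    """Sketch: builds a session with the Azure storage key wired in (not the original helper)."""
    storage_name = os.environ["AZURE_STORAGE_NAME"]  # assumed configuration source
    storage_key = os.environ["AZURE_STORAGE_KEY"]    # assumed configuration source
    return (SparkSession.builder
            .appName(app_name)
            .config(f"spark.hadoop.fs.azure.account.key.{storage_name}"
                    ".blob.core.windows.net", storage_key)
            .getOrCreate())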