def TrainingKEAModel(self, pathToCollectionOfDocs, groundTruthFile, lang, normalization, pathToDFFile, pathToKEAFile, pathToKeaModelsFolder):
    """Compute document-frequency counts, then train a supervised KEA model.

    Training is skipped when a pickled model is already present at
    *pathToKEAFile*; otherwise the models folder is created on demand and
    a fresh model is trained over *pathToCollectionOfDocs*.
    """
    # STEP 2 -- KEA needs corpus-level DF counts before it can be trained.
    print(f"\nSTEP 2: Compute Document Frequency")
    ComputeDF(pathToCollectionOfDocs, lang, normalization, pathToDFFile)
    df = pke.load_document_frequency_file(input_file=pathToDFFile)

    # STEP 3 -- train (or reuse) the KEA model.
    print(
        f"\nSTEP 3: Train KEA Model on top of the following set of docs: {pathToCollectionOfDocs}"
    )
    if os.path.exists(pathToKEAFile):
        # A trained model already sits on disk -- nothing more to do.
        print(f"KEA Model File already exists here: {pathToKEAFile} ")
    else:
        print(
            f"KEA Model doesn't exists. Let's create here: {pathToCollectionOfDocs}. It may take a while."
        )
        # Ensure the destination folder exists before pickling the model.
        os.makedirs(pathToKeaModelsFolder, exist_ok=True)
        pke.train_supervised_model(
            input_dir=pathToCollectionOfDocs,
            reference_file=groundTruthFile,
            model_file=pathToKEAFile,
            extension='txt',
            language=lang,
            normalization=normalization,
            df=df,
            model=pke.supervised.Kea(),
        )
import codecs
import logging
import sys  # fixed: sys.argv was used below but sys was never imported

import pke

# Show INFO-level progress messages in the terminal.
logging.basicConfig(level=logging.INFO)

# Command-line arguments:
#   argv[1] -- path to the collection of documents
#   argv[2] -- path to the reference (gold annotation) file
#   argv[3] -- path to the document-frequency counts file
#   argv[4] -- path where the trained model is saved (pickle)
input_dir = sys.argv[1]
reference_file = sys.argv[2]
df_file = sys.argv[3]

logging.info('loading df counts from '+df_file)
df_counts = pke.load_document_frequency_file(df_file, delimiter='\t')

output_mdl = sys.argv[4]

# Train a supervised KEA model over CoreNLP-formatted XML documents.
# fixed: a comma was missing after the `model=` argument (SyntaxError).
pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           df=df_counts,
                           format="corenlp",
                           use_lemmas=False,
                           stemmer="porter",
                           model=pke.supervised.Kea(),
                           language='english',
                           extension="xml")
import logging
import pke

# Emit INFO-level messages to the terminal.
logging.basicConfig(level=logging.INFO)

# Inputs: the training documents, the gold annotations, and the
# pre-computed document-frequency counts.
input_dir = 'train/'
reference_file = "gold-annotation.txt"
df_file = "df.tsv.gz"

# Output: the trained model, pickled to disk.
output_mdl = "model.pickle"

logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
                                             delimiter='\t')

# Train a supervised KEA model over CoreNLP-formatted XML documents,
# stemming with Porter rather than lemmatizing.
pke.train_supervised_model(
    input_dir=input_dir,
    reference_file=reference_file,
    model_file=output_mdl,
    model=pke.supervised.Kea(),
    df=df_counts,
    format="corenlp",
    extension="xml",
    language='english',
    use_lemmas=False,
    stemmer="porter",
)
import pke #import logging ## Training the model on train set. #train_input_dir = 'drive/My Drive/Recommendation systems/kea_trained/train_doc/' reference_file = 'drive/My Drive/Recommendation systems/kea_trained/reference.txt' output_mdl = "drive/My Drive/Recommendation systems/kea_trained/Models/kea_model.pickle" #train_df_file = 'drive/My Drive/Recommendation systems/kea_trained/train_DF.tsv.gz' #logging.info('Loading df counts from {}'.format(df_file)) df_counts = pke.load_document_frequency_file(input_file='train_DF.tsv.gz', delimiter='\t') pke.train_supervised_model(input_dir='train_doc/', reference_file='reference.txt', model_file='model/kea_model.pickle', extension='txt', language='en', normalization="stemming", df=df_counts, model=pke.supervised.Kea())
import codecs
import logging
import sys  # fixed: sys.argv was used below but sys was never imported

import pke

# Show INFO-level progress messages in the terminal.
logging.basicConfig(level=logging.INFO)

# Command-line arguments:
#   argv[1] -- path to the collection of documents
#   argv[2] -- path to the reference (gold annotation) file
#   argv[3] -- path to the document-frequency counts file
#   argv[4] -- path where the trained model is saved (pickle)
input_dir = sys.argv[1]
reference_file = sys.argv[2]
df_file = sys.argv[3]

logging.info('loading df counts from '+df_file)
df_counts = pke.load_document_frequency_file(df_file, delimiter='\t')

output_mdl = sys.argv[4]

# Train a supervised KEA model over CoreNLP-formatted XML documents.
# fixed: missing comma after the `model=` argument (SyntaxError), and the
# Kea class lives in pke.supervised, not at the pke top level.
pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           df=df_counts,
                           format="corenlp",
                           use_lemmas=False,
                           stemmer="porter",
                           model=pke.supervised.Kea(),
                           language='english',
                           extension="xml")
# Load every training document as a (doc_id, raw text) pair; the doc_id is
# the filename without its extension.  NOTE(review): `documents`, `base`,
# `glob` and `logging` are presumably defined earlier in this file.
for fn in glob(base + os.sep + 'train/*.txt'):
    with open(fn) as f:
        doc = f.read()
    doc_id = os.path.basename(fn).rsplit('.', 1)[0]
    documents.append((doc_id, doc))

logging.info('Loaded {} documents'.format(len(documents)))

# path to the reference file
# Build the gold reference: one line per document, formatted as
# "<doc_id> : kw1,kw2,...".
# NOTE(review): the line's trailing newline is kept on the last keyword,
# and lines without ' : ' raise ValueError -- confirm the file format.
reference = {}
with open(base + os.sep + 'gold-annotation.txt') as f:
    for line in f:
        doc_id, keywords = line.split(' : ')
        reference[doc_id] = keywords.split(',')

# path to the df file
df_file = base + os.sep + 'df.tsv.gz'
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
                                             delimiter='\t')

# path to the model, saved as a pickle
output_mdl = base + os.sep + 'model.pickle'

# Train a supervised KEA model from the in-memory documents/reference.
pke.train_supervised_model(documents,
                           reference,
                           model_file=output_mdl,
                           language='en',
                           normalization='stemming',
                           df=df_counts,
                           model=pke.supervised.Kea())
############################################################################### if not only_test: # Training a supervised Kea model if not os.path.isfile(path_to_kea_file): logging.info("Training supervised model {}".format(path_to_kea_file)) logging.info("loading DF counts from {}".format(path_to_df_file)) df_counts = pke.load_document_frequency_file( input_file=path_to_df_file) pke.train_supervised_model(input_dir=path_to_train, reference_file=params["reference"], model_file=path_to_kea_file, extension=params["extension"], language=params["language"], normalization=params["normalization"], df=df_counts, model=pke.supervised.Kea()) else: # No training set is available if not os.path.isdir(path_to_leave_one_out_models): os.makedirs(path_to_leave_one_out_models) logging.info("Training LOO models {}".format( path_to_leave_one_out_models)) logging.info("loading DF counts from {}".format(path_to_df_file)) df_counts = pke.load_document_frequency_file( input_file=path_to_df_file)
import logging
import os  # fixed: os.sep was used below but os was never imported

import pke

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = 'train' + os.sep

# path to the reference file
reference_file = "gold-annotation.txt"

# path to the df file
df_file = "df.tsv.gz"
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
                                             delimiter='\t')

# path to the model, saved as a pickle
output_mdl = "model.pickle"

# Train a supervised KEA model over XML documents with stemming.
pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           extension='xml',
                           language='en',
                           normalization="stemming",
                           df=df_counts,
                           model=pke.supervised.Kea())