import os
import sys
import glob
import gzip
import logging
from collections import defaultdict

from pke.base import LoadFile


def compute_document_frequency(input_dir,
                               output_file,
                               format="corenlp",
                               extension="xml",
                               use_lemmas=False,
                               stemmer="porter",
                               stoplist=None,
                               delimiter='\t',
                               n=3):
    """Compute n-gram document frequencies from a set of input documents.

    An extra row is added to the output file for specifying the number of
    documents from which the frequencies were computed (--NB_DOC-- tab XX).

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        format (str): the input files format, defaults to corenlp.
        extension (str): file extension for input documents, defaults to xml.
        use_lemmas (bool): whether lemmas from stanford corenlp are used
            instead of stems (computed by nltk), defaults to False.
        stemmer (str): the stemmer in nltk to use (if used), defaults to
            porter.
        stoplist (list): the stop words for filtering n-grams, defaults to
            None.
        delimiter (str): the delimiter between n-grams and document
            frequencies, defaults to tabulation.
        n (int): the size of the n-grams, defaults to 3.
    """

    # document frequency container, mapping each n-gram to the set of
    # documents it occurs in
    frequencies = defaultdict(set)

    # initialize the number of documents
    nb_documents = 0

    # loop through the documents
    for input_file in glob.glob(input_dir + '/*.' + extension):

        logging.info('reading file ' + input_file)

        # initialize load file object
        doc = LoadFile(input_file)

        # read the input file
        doc.read_document(format=format,
                          use_lemmas=use_lemmas,
                          stemmer=stemmer,
                          sep='/')

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering(stoplist=stoplist)

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form].add(input_file)

        nb_documents += 1

    # dump the df container
    with gzip.open(output_file, 'wb') as f:

        # add the number of documents as a special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line.encode('utf-8') + b'\n')

        # write one line per n-gram with the number of documents it occurs in
        for ngram in frequencies:
            line = ngram + delimiter + str(len(frequencies[ngram]))
            f.write(line.encode('utf-8') + b'\n')
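

# Usage sketch for the version above (a minimal example, not from the original
# source): it assumes a directory of Stanford CoreNLP XML files. The path
# 'corpus', the output name 'df.tsv.gz' and the helper name itself are
# hypothetical placeholders.
def _example_df_from_corenlp_dir():
    from nltk.corpus import stopwords

    compute_document_frequency(input_dir='corpus',
                               output_file='df.tsv.gz',
                               format='corenlp',
                               extension='xml',
                               use_lemmas=False,
                               stemmer='porter',
                               stoplist=stopwords.words('english'),
                               n=3)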


def compute_document_frequency(documents,
                               output_file,
                               language='en',
                               stoplist=None,
                               normalization='stemming',
                               delimiter='\t',  # TODO: What is the use case for changing this?
                               n=3):
    """Compute the n-gram document frequencies from a set of input documents.

    An extra row is added to the output file for specifying the number of
    documents from which the document frequencies were computed
    (--NB_DOC-- tab XXX). The output file is compressed using gzip.

    Args:
        documents (list): list of pke-readable documents.
        output_file (str): the output file.
        language (str): language of the input documents (used for computing
            the n-stem or n-lemma forms), defaults to 'en' (english).
        stoplist (list): the stop words for filtering n-grams, defaults to
            pke.lang.stopwords[language].
        normalization (str): word normalization method, defaults to
            'stemming'. The other possible value is 'none' for using word
            surface forms instead of stems/lemmas.
        delimiter (str): the delimiter between n-grams and document
            frequencies, defaults to tabulation ('\\t').
        n (int): the size of the n-grams, defaults to 3.
    """

    # document frequency container
    frequencies = defaultdict(int)

    # initialize the number of documents
    nb_documents = 0

    # loop through the documents
    for document in documents:

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=document,
                          language=language,
                          stoplist=stoplist,
                          normalization=normalization)

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering()

        # loop through candidates; each candidate occurs at most once per
        # document, so incrementing by one yields the document frequency
        for lexical_form in doc.candidates:
            frequencies[lexical_form] += 1

        nb_documents += 1

        # log progress (note: sys.getsizeof only reflects the footprint of
        # the dict itself, not of the keys and values it holds)
        if nb_documents % 1000 == 0:
            logging.info("{} docs, memory used: {} mb".format(
                nb_documents,
                sys.getsizeof(frequencies) / 1024 / 1024))

    # create the output directories if they do not exist
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the df container
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:

        # add the number of documents as a special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line + '\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(frequencies[ngram])
            f.write(line + '\n')
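

# Usage sketch for the version above (an example under assumptions, not from
# the original source): pke's load_document accepts raw strings as input, so
# 'documents' here is a list of strings; the output file name and the helper
# name are hypothetical.
def _example_df_from_strings():
    documents = ['Keyphrase extraction is the task of identifying single or '
                 'multi-word expressions that represent the main topics of a '
                 'document.',
                 'Document frequencies are used to weight candidate '
                 'keyphrases, e.g. in TF-IDF based models.']
    compute_document_frequency(documents,
                               output_file='df.tsv.gz',
                               language='en',
                               normalization='stemming',
                               n=3)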


def compute_document_frequency(input_dir,
                               output_file,
                               extension='xml',
                               language='en',
                               normalization='stemming',
                               stoplist=None,
                               delimiter='\t',
                               n=3,
                               max_length=10**6,
                               encoding=None):
    """Compute the n-gram document frequencies from a set of input documents.

    An extra row is added to the output file for specifying the number of
    documents from which the document frequencies were computed
    (--NB_DOC-- tab XXX). The output file is compressed using gzip.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        extension (str): file extension for input documents, defaults to xml.
        language (str): language of the input documents (used for computing
            the n-stem or n-lemma forms), defaults to 'en' (english).
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
        stoplist (list): the stop words for filtering n-grams, defaults to
            None.
        delimiter (str): the delimiter between n-grams and document
            frequencies, defaults to tabulation ('\\t').
        n (int): the size of the n-grams, defaults to 3.
        max_length (int): maximum length of the input documents, passed on
            to load_document, defaults to 10**6.
        encoding (str): encoding of files in input_dir, defaults to None.
    """

    # document frequency container
    frequencies = defaultdict(int)

    # initialize the number of documents
    nb_documents = 0

    # loop through the documents
    for input_file in glob.iglob(input_dir + os.sep + '*.' + extension):

        # logging.info('reading file {}'.format(input_file))

        # initialize load file object
        doc = LoadFile()

        # read the input file
        doc.load_document(input=input_file,
                          language=language,
                          normalization=normalization,
                          max_length=max_length,
                          encoding=encoding)

        # candidate selection
        doc.ngram_selection(n=n)

        # filter candidates containing punctuation marks
        doc.candidate_filtering(stoplist=stoplist)

        # loop through candidates
        for lexical_form in doc.candidates:
            frequencies[lexical_form] += 1

        nb_documents += 1

        # log progress (note: sys.getsizeof only reflects the footprint of
        # the dict itself, not of the keys and values it holds)
        if nb_documents % 1000 == 0:
            logging.info("{} docs, memory used: {} mb".format(
                nb_documents,
                sys.getsizeof(frequencies) / 1024 / 1024))

    # create the output directories if they do not exist
    if os.path.dirname(output_file):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # dump the df container
    with gzip.open(output_file, 'wt', encoding='utf-8') as f:

        # add the number of documents as a special token
        first_line = '--NB_DOC--' + delimiter + str(nb_documents)
        f.write(first_line + '\n')

        for ngram in frequencies:
            line = ngram + delimiter + str(frequencies[ngram])
            f.write(line + '\n')
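

# Usage sketch for the version above (a sketch under assumptions, not from the
# original source): it reads raw text files from a directory; 'corpus',
# 'df.tsv.gz', the '.txt' extension and the helper name are hypothetical
# placeholders.
def _example_df_from_dir():
    compute_document_frequency(input_dir='corpus',
                               output_file='df.tsv.gz',
                               extension='txt',
                               language='en',
                               normalization='stemming',
                               n=3,
                               encoding='utf-8')
    # the resulting gzipped file can then be read back, e.g. with pke's
    # load_document_frequency_file utility, by models that need df counts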