os.makedirs(OUTPUT_DIR)

# Number of permutations
N = 1000

# The size of the train corpus (in number of sentences)
#N_TRAIN = 184

# Read the input file and obtain a list of lists of strings,
# i.e. the list of sentences
f = open(INPUT_FILE, 'r')
lines = f.readlines()
f.close()
sentences = [l.strip().split(" ") for l in lines]

# Get the unigram and bigram counts of the unpermuted corpus
unigrams_unpermuted = compare_bigrams.unigram_counts(sentences)
bigrams_unpermuted = compare_bigrams.bigram_counts(sentences)

# Determine the length of each sentence
lengths = [len(s) for s in sentences]

# Put all the words in a single flat list
flat_sentences = []
for s in sentences:
    flat_sentences += s
print("The corpus has %i words, originally in %i sentences"
      % (len(flat_sentences), len(sentences)))

# First, put all the words in a line
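# The counting helpers above live in this repository's compare_bigrams
# module. As a rough sketch of what they are assumed to compute (token
# counts within sentences, never across sentence boundaries; see the
# "omitting those involving word boundaries" note below), one could write
# the following. These are hypothetical stand-ins, not the module's actual
# implementation:
from collections import Counter

def unigram_counts_sketch(sentences):
    # Count every word token; sentences is a list of lists of words
    return Counter(w for s in sentences for w in s)

def bigram_counts_sketch(sentences):
    # Count adjacent word pairs within each sentence, so that no bigram
    # spans a sentence boundary
    return Counter((s[j], s[j + 1])
                   for s in sentences
                   for j in range(len(s) - 1))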
import random

import pandas as pd

import compare_bigrams

# The file that contains the base corpus.
# Each line should be a sentence, with the words separated by spaces.
INPUT_FILE = "../corpus/cath8.txt"

# Read the input file and obtain a list of lists of strings,
# i.e. the list of sentences
f = open(INPUT_FILE, 'r')
lines = f.readlines()
f.close()
sentences = [l.strip().split(" ") for l in lines]

corpus_stats = pd.DataFrame()

# Get the unigram and bigram counts, omitting those involving word boundaries.
bigrams_unpermuted = compare_bigrams.bigram_counts(sentences)
unigrams_unpermuted = compare_bigrams.unigram_counts(sentences)

for i in range(N_BOOTSTRAP_SAMPLES):
    print(i, end=" ", flush=True)

    # Take a bootstrap sample: draw len(sentences) sentences, with replacement
    bootstrap_corpus = []
    for j in range(len(sentences)):
        bootstrap_corpus.append(random.choice(sentences))

    # We now have a bootstrap-resampled corpus with the same number of
    # sentences, but not necessarily the same sentence lengths, nor
    # necessarily the same unigram or bigram distributions.
    generated_corpus_bigrams = compare_bigrams.bigram_counts(
        bootstrap_corpus)
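# Aside: the resampling loop above is equivalent to a single call to
# random.choices (Python 3.6+), which also samples with replacement:
#
#     bootstrap_corpus = random.choices(sentences, k=len(sentences))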
def generate_bigram_corpus(i):
    print(i, end=" ", flush=True)

    # First make a candidate corpus
    corpus = []
    total_length = 0
    while total_length < N_WORDS_IN_CORPUS:
        # Generate one sentence from the bigram model
        sent, _ = list(
            compare_bigrams.generate_words_from_bigrams(
                bigrams_unpermuted, 1, maxlength,
                progress_output=False).values())[0][0]
        # sent is encoded as "a.b.c.d", so turn it back into a list of words
        sent = sent.split(".")
        corpus += [sent]
        total_length += len(sent)

    # The corpus is now slightly bigger (in number of words) than the
    # target. Consider discarding the last sentence, and check whether that
    # brings us closer to the target length.
    dlength = abs(total_length - N_WORDS_IN_CORPUS)
    # The deviation from the target if we discard the final addition:
    dlength_discard = abs((total_length - len(corpus[-1])) - N_WORDS_IN_CORPUS)
    if dlength_discard < dlength:
        # Discarding the last sentence brings us closer to the target length
        corpus = corpus[:-1]
        total_length = sum([len(s) for s in corpus])

    generated_corpus_bigrams = compare_bigrams.bigram_counts(corpus)
    generated_corpus_unigrams = compare_bigrams.unigram_counts(corpus)

    comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                          generated_corpus_bigrams)
    comp = dict([("bigrams.%s" % k, v) for (k, v) in comp.items()])
    unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                             generated_corpus_unigrams)
    unicomp = dict([("unigrams.%s" % k, v) for (k, v) in unicomp.items()])

    # Also quantify copying
    cop = quantify_copying.corpus(corpus)

    # Combine all metrics we've gathered about this corpus
    comp = {**comp, **unicomp, **cop}  # merge the dicts
    comp["n.unique.bigrams"] = len(generated_corpus_bigrams.keys())
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())

    # Acceptance check (currently always true)
    if True:
        comp["randomisation"] = i
        corpus_stats = pd.DataFrame(comp, index=[i])

        # Write the generated corpus to file.
        # (The commented-out code below would shuffle the sentences and
        # split the corpus into train and test portions; it is disabled.)
        #random.shuffle(corpus)
        #train_corpus = corpus[:N_TRAIN]
        #test_corpus = corpus[N_TRAIN:]
        f = open('%s/bigramfree_%05i_corpus.txt' % (OUTPUT_DIR, i + 1), 'w')
        f.write("\n".join([" ".join(w) for w in corpus]))
        f.close()
        #f = open('%s/bigramfree_%05i_test_corpus.txt' % (OUTPUT_DIR, i + 1), 'w')
        #f.write("\n".join([" ".join(w) for w in test_corpus]))
        #f.close()

        return corpus_stats
    return None  # reached only if the acceptance check rejects the corpus
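# A sketch of how this function might be driven. The constant N (number of
# randomisations), the aggregation, and the output filename below are
# assumptions for illustration, not part of this excerpt:
#
#     all_stats = pd.concat([s for s in (generate_bigram_corpus(i)
#                                        for i in range(N))
#                            if s is not None])
#     all_stats.to_csv('%s/bigramfree_stats.csv' % OUTPUT_DIR)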
# The size of the train corpus (in number of sentences)
#N_TRAIN = 184

# Read the input file and obtain a list of lists of strings,
# i.e. the list of sentences
f = open(INPUT_FILE, 'r')
lines = f.readlines()
f.close()
cath8 = [l.strip().split(" ") for l in lines]

N_WORDS_IN_CORPUS = sum([len(s) for s in cath8])
maxlength = 999999  # effectively no limit on sentence length

# Get the bigram counts, omitting those involving word boundaries.
bigrams_unpermuted = compare_bigrams.bigram_counts(cath8)
unigrams_unpermuted = compare_bigrams.unigram_counts(cath8)

# Now "flatten" the bigram distribution: set every attested bigram's count
# to 1, making all attested bigrams equally likely during generation.
for bigr in bigrams_unpermuted:
    bigrams_unpermuted[bigr] = 1

def generate_bigram_corpus(i):
    print(i, end=" ", flush=True)

    # First make a candidate corpus
    corpus = []
    total_length = 0
    while total_length < N_WORDS_IN_CORPUS:
        # we obtain a sentence of a particular length
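# Aside on the flattening step above: with every count set to 1, sampling a
# successor in the bigram model reduces to a uniform choice among attested
# successors. A minimal illustration of that consequence, assuming bigram
# counts keyed by (word, word) tuples (hypothetical helper, not the
# repository's generate_words_from_bigrams):
import random

def next_word_uniform(bigram_counts, current):
    # Every surviving bigram has equal weight, so each attested successor
    # of `current` is equally likely
    successors = [b for (a, b) in bigram_counts if a == current]
    return random.choice(successors) if successors else None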
def generate_bigram_corpus(i):
    # Generates one bigram corpus (randomisation i),
    # writes it to a file, and returns the corpus stats.
    print(i, end=" ", flush=True)

    # First make a candidate corpus
    corpus = []
    for l in lengths:
        # Choose a sentence of length l from the randomly generated
        # sentences, weighted by how often the model produced it
        #sent = random.choice(supercorpus[l])
        sent = compare_bigrams.weighted_choice(supercorpus[l])
        # sent is encoded as "a.b.c.d", so turn it back into a list of
        # words and add it to the corpus
        corpus.append(sent.split("."))

    # We now have a corpus with the same number of words and the same
    # distribution of sentence lengths as the original corpus. What remains
    # to be checked is whether the distribution of unigrams and bigrams is
    # approximately correct. As a metric we use Cramér's V.
    generated_corpus_bigrams = compare_bigrams.bigram_counts(corpus)
    generated_corpus_unigrams = compare_bigrams.unigram_counts(corpus)

    comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                          generated_corpus_bigrams)
    comp = dict([("bigrams.%s" % k, v) for (k, v) in comp.items()])
    unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                             generated_corpus_unigrams)
    unicomp = dict([("unigrams.%s" % k, v) for (k, v) in unicomp.items()])

    comp["n.unique.bigrams"] = len(generated_corpus_bigrams.keys())
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())

    # Also quantify copying
    cop = quantify_copying.corpus(corpus)

    # Combine all metrics we've gathered about this corpus
    comp = {**comp, **unicomp, **cop}  # merge the dicts

    # Acceptance check (currently always true; hopefully the generated
    # corpus passes it reasonably often)
    if True:
        comp["permutation"] = i
        stats = pd.DataFrame(comp, index=[i])

        # Write the generated corpus to file.
        # (The commented-out code below would shuffle the sentences and
        # split the corpus into train and test portions; it is disabled.)
        #random.shuffle(corpus)
        #train_corpus = corpus[:N_TRAIN]
        #test_corpus = corpus[N_TRAIN:]
        f = open('%s/permutation_%05i.txt' % (OUTPUT_DIR, i), 'w')
        f.write("\n".join([" ".join(w) for w in corpus]))
        f.close()
        #f = open('%s/bigramgen_%05i_test_corpus.txt' % (OUTPUT_DIR, i + 1), 'w')
        #f.write("\n".join([" ".join(w) for w in test_corpus]))
        #f.close()

        return stats
    return None  # reached only if the acceptance check rejects the corpus
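# compare_Ngrams lives in compare_bigrams and, per the comment above,
# reports Cramér's V between the original and generated count
# distributions. A minimal sketch of such a computation, assuming a 2xK
# contingency table over the union of observed n-grams (hypothetical
# helper, not the module's actual implementation):
import math

def cramers_v_sketch(counts_a, counts_b):
    # Build a 2xK contingency table: one row per corpus, one column per n-gram
    keys = sorted(set(counts_a) | set(counts_b))
    obs = [[counts_a.get(k, 0) for k in keys],
           [counts_b.get(k, 0) for k in keys]]
    n = float(sum(sum(row) for row in obs))
    row_tot = [sum(row) for row in obs]
    col_tot = [obs[0][j] + obs[1][j] for j in range(len(keys))]
    # Pearson's chi-squared statistic over the table
    chi2 = 0.0
    for r in (0, 1):
        for j in range(len(keys)):
            expected = row_tot[r] * col_tot[j] / n
            if expected > 0:
                chi2 += (obs[r][j] - expected) ** 2 / expected
    # For a 2xK table, min(rows, cols) - 1 == 1, so V = sqrt(chi2 / n)
    return math.sqrt(chi2 / n)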
MAX_UNIGRAM_CRAMER_V = 1.0
MAX_BIGRAM_CRAMER_V = 1.0

# Read the input file and obtain a list of lists of strings,
# i.e. the list of sentences
f = open(INPUT_FILE, 'r')
lines = f.readlines()
f.close()
sentences = [l.strip().split(" ") for l in lines]

# Get the bigram counts, omitting those involving word boundaries.
bigrams_unpermuted = compare_bigrams.bigram_counts(sentences)
unigrams_unpermuted = compare_bigrams.unigram_counts(sentences)

# Determine the length of each sentence
lengths = [len(s) for s in sentences]
maxlength = max(lengths)  # the maximum sentence length

print("Generating the super corpus...")
# Generate a very large number of sentences from the bigram model, storing
# them all in one huge dict whose keys are the sentence lengths.
supercorpus = compare_bigrams.generate_words_from_bigrams(
    bigrams_unpermuted, N_SUPERCORPUS_SENTENCES, maxlength)
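# weighted_choice (used by generate_bigram_corpus above) is assumed to draw
# one sentence from the (sentence, weight) pairs stored under each length
# in supercorpus, with probability proportional to the weights. A minimal
# sketch under that assumption (hypothetical stand-in, not the repository's
# implementation):
import random

def weighted_choice_sketch(pairs):
    items = [item for (item, _) in pairs]
    weights = [weight for (_, weight) in pairs]
    # random.choices draws with probability proportional to the weights
    return random.choices(items, weights=weights, k=1)[0]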