Example #1
# Create the output directory if it does not already exist
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Number of permutations
N = 1000

# The size of the train corpus (in number of sentences)
#N_TRAIN = 184

# Read the input file and obtain a list of list of strings, i.e. the list of sentences
f = open(INPUT_FILE, 'r')
lines = f.readlines()
f.close()
sentences = [l.strip().split(" ") for l in lines]

# Get the unigram and bigram counts of the original corpus
unigrams_unpermuted = compare_bigrams.unigram_counts(sentences)
bigrams_unpermuted = compare_bigrams.bigram_counts(sentences)

# Determine the length of each sentence
lengths = [len(s) for s in sentences]

# Put all the words in a flat list
flat_sentences = []
for s in sentences:
    flat_sentences += s

print("The corpus has %i words, originally in %s sentences" %
      (len(flat_sentences), len(sentences)))

# First, put all the words in a line
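
# A hedged sketch of the permutation loop that presumably follows here (the
# excerpt stops at this point): shuffle the flat word list, re-cut it into
# sentences with the original lengths, and write each permuted corpus to a
# file. The output filename mirrors the one used in the later examples, and
# random is assumed to be imported at the top of the script.
for i in range(N):

    shuffled = list(flat_sentences)
    random.shuffle(shuffled)

    # Re-split the shuffled word list into sentences of the original lengths
    permuted, pos = [], 0
    for l in lengths:
        permuted.append(shuffled[pos:pos + l])
        pos += l

    f = open('%s/permutation_%05i.txt' % (OUTPUT_DIR, i), 'w')
    f.write("\n".join([" ".join(s) for s in permuted]))
    f.close()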
Example #2
    # The file that contains the base corpus
    INPUT_FILE = "../corpus/cath8.txt"
    # Each line should be a sentence, with the words separated by spaces

    # Read the input file and obtain a list of list of strings, i.e. the list of sentences
    f = open(INPUT_FILE, 'r')
    lines = f.readlines()
    f.close()
    sentences = [l.strip().split(" ") for l in lines]

    corpus_stats = pd.DataFrame()

    # Get a list of the bigrams, but omitting those involving word boundaries.
    bigrams_unpermuted = compare_bigrams.bigram_counts(sentences)
    unigrams_unpermuted = compare_bigrams.unigram_counts(sentences)

    for i in range(N_BOOTSTRAP_SAMPLES):

        print(i, end=" ", flush=True)

        # Take a bootstrap sample
        bootstrap_corpus = []
        for j in range(len(sentences)):
            bootstrap_corpus.append(random.choice(sentences))

        # So now we have a bootstrap-resampled corpus with the same number
        # of sentences, but not necessarily the same sentence lengths, nor
        # necessarily the same unigram or bigram distributions.
        generated_corpus_bigrams = compare_bigrams.bigram_counts(
            bootstrap_corpus)
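
        # A hedged sketch of how this loop presumably continues (the excerpt
        # stops here), following the comparison pattern used in the other
        # examples: compare the bootstrap counts against the unpermuted ones
        # and collect the results in corpus_stats. The "bootstrap.sample"
        # column name is an assumption.
        generated_corpus_unigrams = compare_bigrams.unigram_counts(
            bootstrap_corpus)

        comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                              generated_corpus_bigrams)
        comp = dict([("bigrams.%s" % k, v) for (k, v) in comp.items()])

        unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                                 generated_corpus_unigrams)
        unicomp = dict([("unigrams.%s" % k, v) for (k, v) in unicomp.items()])

        comp = {**comp, **unicomp}
        comp["bootstrap.sample"] = i
        corpus_stats = pd.concat(
            [corpus_stats, pd.DataFrame(comp, index=[i])])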
Example #3

def generate_bigram_corpus(i):

    print(i, end=" ", flush=True)

    # First make a candidate corpus
    corpus = []
    total_length = 0
    while total_length < N_WORDS_IN_CORPUS:  # keep adding sentences until we reach the target corpus size

        # Generate one item
        sent, _ = list(
            compare_bigrams.generate_words_from_bigrams(
                bigrams_unpermuted, 1, maxlength,
                progress_output=False).values())[0][0]
        sent = sent.split(".")
        corpus += [sent]
        total_length += len(sent)

    # Now we have a corpus that is slightly larger (in number of words) than the target corpus.
    # Check whether discarding the last sentence would bring us closer to the target length.
    dlength = abs(total_length - N_WORDS_IN_CORPUS)
    dlength_discard = abs(
        (total_length - len(corpus[-1])) - N_WORDS_IN_CORPUS
    )  # the length of the corpus if we would discard the final addition

    if dlength_discard < dlength:  # if, discarding the last sentence, we are closer to the target length, do the discard
        corpus = corpus[:-1]

    total_length = sum([len(s) for s in corpus])

    generated_corpus_bigrams = compare_bigrams.bigram_counts(corpus)
    generated_corpus_unigrams = compare_bigrams.unigram_counts(corpus)

    comp = compare_bigrams.compare_Ngrams(bigrams_unpermuted,
                                          generated_corpus_bigrams)
    comp = dict([("bigrams.%s" % k, v) for (k, v) in comp.items()])

    unicomp = compare_bigrams.compare_Ngrams(unigrams_unpermuted,
                                             generated_corpus_unigrams)
    unicomp = dict([("unigrams.%s" % k, v) for (k, v) in unicomp.items()])

    # Also quantify copying
    cop = quantify_copying.corpus(corpus)

    # Combine all metrics we've gathered about this corpus
    comp = {
        **comp,
        **unicomp,
        **cop
    }  #dict(comp.items()+unicomp.items()+cop.items()) # merge the dicts
    comp["n.unique.bigrams"] = len(generated_corpus_bigrams.keys())
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())

    if True:

        comp["randomisation"] = i
        corpus_stats = pd.DataFrame(comp, index=[i])

        # Write the shuffled corpora to file

        # Now also shuffle the word list, so that we don't always have the same lengths in the train and test
        # corpus after the following split.
        #random.shuffle(corpus)

        # Split into train and test corpus
        #train_corpus = corpus[:N_TRAIN]
        #test_corpus  = corpus[N_TRAIN:]

        # Write the shuffled corpora to file
        f = open('%s/bigramfree_%05i_corpus.txt' % (OUTPUT_DIR, i + 1), 'w')
        corpus = "\n".join([" ".join(w) for w in corpus])
        f.write(corpus)
        f.close()

        #f = open('%s/bigramfree_%05i_test_corpus.txt'%(OUTPUT_DIR,i+1),'w')
        #corpus = "\n".join([ " ".join(w) for w in test_corpus ])
        #f.write(corpus)
        #f.close()

        return corpus_stats
    return None
Example #4

# The size of the train corpus (in number of sentences)
#N_TRAIN = 184

# Read the input file and obtain a list of list of strings, i.e. the list of sentences
f = open(INPUT_FILE, 'r')
lines = f.readlines()
f.close()
cath8 = [l.strip().split(" ") for l in lines]

N_WORDS_IN_CORPUS = sum([len(s) for s in cath8])

maxlength = 999999  # effectively no limit on sentence length

# Get a list of the bigrams, but omitting those involving word boundaries.
bigrams_unpermuted = compare_bigrams.bigram_counts(cath8)
unigrams_unpermuted = compare_bigrams.unigram_counts(cath8)

# Ok, now we "flatten" the list: make all bigrams equally likely
for bigr in bigrams_unpermuted:
    bigrams_unpermuted[bigr] = 1


def generate_bigram_corpus(i):

    print(i, end=" ", flush=True)

    # First make a candidate corpus
    corpus = []
    total_length = 0
    while total_length < N_WORDS_IN_CORPUS:  # keep adding sentences until we reach the target corpus size
Example #5
def generate_bigram_corpus(i):

    # This generates one bigram corpus (randomisation i),
    # writes it to a file, and returns the corpus stats.

    print(i, end=" ", flush=True)
    
    # First make a candidate corpus
    corpus = []
    for l in lengths: # we obtain a sentence of a particular length

        # Choose a sentence of that length from the randomly generated sentences
        #sent = random.choice(supercorpus[l])
        sent = compare_bigrams.weighted_choice(supercorpus[l])
        # Sent will be encoded as a.b.c.d, so now we turn it back into a list, and add it to the corpus:
        corpus.append(sent.split("."))


    # So now we should have a corpus with the same number of words and the
    # same distribution of sentence lengths as the original corpus.
    # What we need to check is whether the distribution of unigrams and
    # bigrams is approximately correct. As a metric we use Cramér's V
    # (see the sketch after this function).

    generated_corpus_bigrams  = compare_bigrams.bigram_counts ( corpus )
    generated_corpus_unigrams = compare_bigrams.unigram_counts( corpus )
    
    comp = compare_bigrams.compare_Ngrams( bigrams_unpermuted, generated_corpus_bigrams )
    comp = dict([ ("bigrams.%s"%k,v) for (k,v) in comp.items() ])

    unicomp = compare_bigrams.compare_Ngrams( unigrams_unpermuted, generated_corpus_unigrams )
    unicomp = dict([ ("unigrams.%s"%k,v) for (k,v) in unicomp.items() ])

    comp["n.unique.bigrams"]  = len(generated_corpus_bigrams.keys())
    comp["n.unique.unigrams"] = len(generated_corpus_unigrams.keys())

    # Also quantify copying
    cop = quantify_copying.corpus(corpus)
    
    # Combine all metrics we've gathered about this corpus
    comp = {**comp,**unicomp,**cop} # merge the dicts
    

    # Hope this happens reasonably often
    if True:

        comp["permutation"]=i

        stats = pd.DataFrame(comp,index=[i])
    
        # Write the shuffled corpora to file

        # Now also shuffle the word list, so that we don't always have the same lengths in the train and test
        # corpus after the following split.
        #random.shuffle(corpus)
        
        # Split into train and test corpus
        #train_corpus = corpus[:N_TRAIN]
        #test_corpus  = corpus[N_TRAIN:]

        # Write the shuffled corpora to file
        f = open('%s/permutation_%05i.txt'%(OUTPUT_DIR,i),'w')
        corpus = "\n".join([ " ".join(w) for w in corpus ])
        f.write(corpus)
        f.close()


        #f = open('%s/bigramgen_%05i_test_corpus.txt'%(OUTPUT_DIR,i+1),'w')
        #corpus = "\n".join([ " ".join(w) for w in test_corpus ])
        #f.write(corpus)
        #f.close()


        return stats

    return None
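
# The comment inside the function above names Cramér's V as the comparison
# metric. The implementation of compare_bigrams.compare_Ngrams is not shown
# in these excerpts, so the following is only a hedged, minimal sketch of
# Cramér's V for two N-gram count dicts, assuming they are compared as a
# 2 x k contingency table (one row per corpus, one column per N-gram type).
import numpy as np
from scipy.stats import chi2_contingency


def cramers_v_sketch(counts_a, counts_b):
    # Build the 2 x k contingency table over the union of N-gram types
    keys = sorted(set(counts_a) | set(counts_b))
    table = np.array([[counts_a.get(k, 0) for k in keys],
                      [counts_b.get(k, 0) for k in keys]])
    chi2, _, _, _ = chi2_contingency(table)
    n = table.sum()
    # For a 2 x k table, min(rows, cols) - 1 == 1, so V = sqrt(chi2 / n)
    return np.sqrt(chi2 / (n * (min(table.shape) - 1)))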
Example #6
MAX_UNIGRAM_CRAMER_V = 1.0
MAX_BIGRAM_CRAMER_V = 1.0



# Read the input file and obtain a list of list of strings, i.e. the list of sentences
f = open(INPUT_FILE,'r')
lines = f.readlines()
f.close()
sentences = [ l.strip().split(" ") for l in lines ]



# Get a list of the bigrams, but omitting those involving word boundaries.
bigrams_unpermuted  = compare_bigrams.bigram_counts( sentences )
unigrams_unpermuted = compare_bigrams.unigram_counts( sentences )


# Determine the length of each sentence
lengths = [ len(s) for s in sentences ]
maxlength = max(lengths) # the maximum sentence length

print ("Generating the super corpus...")

# Generate a very large number of sentences from these bigrams, storing them
# all in a huge dict whose keys are the sentence lengths.
supercorpus = compare_bigrams.generate_words_from_bigrams(
    bigrams_unpermuted,
    N_SUPERCORPUS_SENTENCES,
    maxlength)
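
# A hedged sketch of the driver loop that presumably follows: run
# generate_bigram_corpus() (defined in the previous example) for each
# permutation and combine the returned per-corpus statistics. N (the number
# of permutations), OUTPUT_DIR and the combined-stats filename are
# assumptions; they are not shown in this excerpt.
all_stats = []
for i in range(N):
    stats = generate_bigram_corpus(i)
    if stats is not None:
        all_stats.append(stats)

corpus_stats = pd.concat(all_stats)
corpus_stats.to_csv('%s/corpus_stats.csv' % OUTPUT_DIR)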