Example #1
pip install nltk==3.3
pip install spacy==2.0.11
python -m spacy download en_core_web_sm
"""

import random
import nltk
nltk.download("wordnet") # Downloading the wordnet corpus
from nltk.corpus import wordnet as wn

import spacy
from spacy.lang.en import English

# Making sure that the versions are exactly the same
assert nltk.__version__ == "3.3"
assert wn.get_version() == "3.0"
assert spacy.__version__ == "2.0.11"

def main():
    # loading the tokenizer
    spacy_nlp = spacy.load("en_core_web_sm")
    tokenizer = English().Defaults.create_tokenizer(spacy_nlp)
    
    # Obtain all synsets and split them into train, dev and test.
    # Splitting along synsets (rather than instances) is important to avoid tainting the test data.
    all_synsets = list(wn.all_synsets())
    random.seed(742382)
    random.shuffle(all_synsets)

    # 0.8/0.1/0.1 train/dev/test split
    split_index_train_dev = int(len(all_synsets) * 0.8)
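The excerpt stops at the first split index; a minimal sketch of how the 0.8/0.1/0.1 split described above could be completed (the variable names below are assumptions, not from the original):

    # Hypothetical continuation of the 0.8/0.1/0.1 split
    split_index_dev_test = int(len(all_synsets) * 0.9)
    train_synsets = all_synsets[:split_index_train_dev]
    dev_synsets = all_synsets[split_index_train_dev:split_index_dev_test]
    test_synsets = all_synsets[split_index_dev_test:]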
Example #2
        try:
            nltk.download(token, quiet=True, raise_on_error=True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL. Then we will try this...
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet=True, raise_on_error=True)

# Use the Brown corpus for calculating information content (IC)
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wn_ic

brown_ic = wn_ic.ic('ic-brown.dat')
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------

DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
    "e": ("é", "ë", "ê", "è"),
    "i": ("í", "ï", "î", "ì"),
    "o": ("ó", "ö", "ô", "ò", "ō", "ø"),
    "u": ("ú", "ü", "û", "ù", "ů"),
    "y": ("ý", "ÿ", "ý"),
    "s": ("š",),
    "c": ("ç", "č"),
    "n": ("ñ",),
    "z": ("ž",)
}
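The DIACRITICS map above suggests an ASCII-folding step; a minimal sketch of such a helper, assuming the name strip_diacritics (not part of the original):

# Hypothetical helper: fold accented characters back to their ASCII base letters
def strip_diacritics(text):
    for base, variants in DIACRITICS.items():
        for variant in variants:
            text = text.replace(variant, base)
            text = text.replace(variant.upper(), base.upper())
    return text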
Example #3
    df = find_synonyms(df)
    df = create_new_aspects_from_synonyms(df)
    df["aspect"] = flatten_column_lists(df["aspect"])
    df["opinion"] = flatten_column_lists(df["opinion"])

    df = reformat_output_file(df, 3)
    save_file(df, name + "_WORDNET_WSD")
    end = timer()
    logging.debug("Whole program: %.2f seconds" % (end - start))
    # wsd_pywsd_simple_lesk(df)
    # wsd_pywsd_adapted_lesk(df)
    # find_synonyms(df)

if __name__ == '__main__':
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
    logging.debug("Wordnet version: %s" % wn.get_version())
    logging.debug("Wordnet adjective: %s" % wn.ADJ)
    logging.debug("Wordnet verb: %s" % wn.VERB)
    logging.debug("Wordnet noun: %s" % wn.NOUN)
    logging.debug("Wordnet adverb: %s" % wn.ADV)

    argument = return_sys_arguments(sys.argv)
    if argument is None:
        print("You didn't give an argument")
    elif os.path.isdir(argument):
        files = read_folder_contents(argument)
        print("Gave a folder: %s, that has %s files." % (argument, str(len(files))))
        x = 0
        for f in files:
            x += 1
            df = open_file(argument + "/" + f, "pandas")
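The flatten_column_lists helper used above is not part of the excerpt; a plausible sketch, assuming it collapses list-valued cells of a pandas column into space-joined strings:

# Hypothetical sketch of flatten_column_lists: collapse list-valued cells
# into space-joined strings, leaving scalar values untouched
def flatten_column_lists(column):
    return column.apply(lambda cell: " ".join(cell) if isinstance(cell, list) else cell)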
Example #4
from nltk.corpus import wordnet as wn

try:
    wn.get_version()
except LookupError:
    # The WordNet corpus is not installed yet; fetch it via NLTK
    import nltk

    nltk.download('wordnet')

import xml.etree.ElementTree as ET

import logging

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')
import logging.config

logging.config.dictConfig({
    'version': 1,
    'disable_existing_loggers': True,
})


class SemcorReader:
    def read_sequences(self, in_file, limit=-1):
        root = ET.parse(in_file).getroot()
        for i, s in enumerate(root.findall('text/sentence')):
            if i == limit: break

            seq_tokens = []
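The per-token loop is cut off here; a hypothetical continuation, assuming the usual WSD-framework SemCor XML layout with <wf> and <instance> children per sentence:

            for token in s:
                # <wf> elements are plain words; <instance> elements are the
                # sense-annotated targets, identified by their id attribute
                lemma = token.get('lemma')
                pos = token.get('pos')
                inst_id = token.get('id') if token.tag == 'instance' else None
                seq_tokens.append((token.text, lemma, pos, inst_id))
            yield seq_tokens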
Example #5
import sys

import nltk
from nltk.corpus import brown as brown_corpus
from nltk.corpus import wordnet


def die_nltk_data_error(corpus):
    sys.stderr.write("Missing nltk data (%s). Use the install_nltk_data.sh script\n" % corpus)
    sys.exit(1)

try:
    brown_corpus.words()
except LookupError:
    die_nltk_data_error('brown')

try:
    wordnet.get_version()
except LookupError:
    die_nltk_data_error('wordnet')

def extract_keywords(prompt):
    tokens = nltk.word_tokenize(prompt)
    # build the brown freq dist - slow!
    fd = nltk.FreqDist(brown_corpus.words())

    # decorate sort undecorate with freq
    tokens_with_freq = [(fd.freq(t), t) for t in tokens]
    for _, t in sorted(tokens_with_freq):
        print(t)


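Since the comment above flags rebuilding the Brown frequency distribution on every call as slow, one option (a sketch, not part of the original) is to build it once and reuse it:

# Hypothetical caching variant: compute the Brown FreqDist once and reuse it
_BROWN_FD = None

def get_brown_freq_dist():
    global _BROWN_FD
    if _BROWN_FD is None:
        _BROWN_FD = nltk.FreqDist(brown_corpus.words())
    return _BROWN_FD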
Example #6
if __name__ == '__main__':

    similarity = wup
    T = 0.76
    results_prefix = 'all_kadist_works'
    file_trials = 'data/all_annotated_trials.json'  # annotated with tag_kadist_docs.py
    compute_person_metrics = False
    abbreviated = False
    abbreviated_size = 100

    cluster_types = ['clusters', 'superclusters']

    random.seed(42)

    print(' *', 'using WordNet version:', wordnet.get_version())
    print(' *', 'using', 'similarity fn', similarity.__name__, 'T', T)
    print(' *', 'compute_person_metrics', compute_person_metrics)
    print(' *', 'results_prefix', results_prefix)
    if abbreviated:
        cluster_types = ['clusters']
        print(
            ' *',
            'abbreviated mode, limiting to {} (stable sample) trials'.format(
                abbreviated_size))

    for cluster_type in cluster_types:
        file_clusters = f'data/{cluster_type}.json'
        with codecs.open(file_clusters, 'rb', 'utf-8') as f_clusters:
            clusters = preprocess_clusters(json.loads(f_clusters.read()))
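The wup similarity assigned above is not defined in the excerpt; a minimal sketch built on NLTK's Wu-Palmer measure (the word-level signature is an assumption):

# Hypothetical sketch of wup(): best Wu-Palmer similarity over all synset pairs
def wup(word1, word2):
    best = 0.0
    for s1 in wordnet.synsets(word1):
        for s2 in wordnet.synsets(word2):
            score = s1.wup_similarity(s2)
            if score is not None and score > best:
                best = score
    return best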