def main(childes_xml_remote_root='http://childes.psy.cmu.edu/data-xml/', verbose=False):
    user_data_path = Downloader.default_download_dir(Downloader())
    childes_corpus_path = os.path.join(user_data_path, 'corpora/CHILDES/')

    try:
        os.makedirs(childes_corpus_path)
    except OSError:
        pass

    CHILDESDir(childes_xml_remote_root, childes_corpus_path).download(verbose)
Example #2
def dl_nltk():
    TO_DL = ['stopwords', 'punkt']

    dler = Downloader('https://pastebin.com/raw/D3TBY4Mj')

    for to_dl in TO_DL:
        if not nltk.download(to_dl):
            print('Downloading NLTK data from alternative source...')
            if not dler.download(to_dl):
                print('Failed to download NLTK data...')
Example #3
    def __init__(self):
        """Initialize parameters and download the AfricanWordNet data."""

        DEFAULT_URL = 'https://raw.githubusercontent.com/JosephSefara/AfricanWordNet/master/data/index.xml'
        """The default URL for the NLTK data server's index"""

        downloader = Downloader(server_index_url=DEFAULT_URL)
        downloader.download('africanwordnet')
Example #4
def nltk_updater():
    packages = {'brown',
                'wordnet',
                'wordnet_ic',
                'maxent_treebank_pos_tagger',
                'universal_tagset'}
    d = Downloader()
    # Remove any package that is already present in the default download directory.
    for dirpath, dirnames, filenames in os.walk(d.default_download_dir()):
        module = os.path.basename(dirpath)
        if module in packages:
            packages.remove(module)
    for module in packages:
        print('Missing', module)
        download(module)
Example #5
    def build_list_from_nltk(self, lang):
        downloader = Downloader()

        # Check if NLTK data directory exists.
        if StopwordRemover.nltk_dir is None:
            # Create temporary directory for download
            StopwordRemover.nltk_dir = tempfile.mkdtemp(prefix='cherami')
            nltk.data.path = [StopwordRemover.nltk_dir]
            
            logger.info('NLTK data directory is "{0}"'
                .format(StopwordRemover.nltk_dir))
        
        # Check if the NLTK data has already been downloaded.
        if not downloader.is_installed('stopwords'):
            logger.info('Downloading NLTK stopword data...')
            downloader.download('stopwords', StopwordRemover.nltk_dir, True)
            logger.info('NLTK stopword data downloaded.')

        for word in stopwords.words(lang):
            self.stopword_list.add(word)
Example #6
    def __init__(self):
        self.logger = logging.getLogger('vert')
        super(WordMoversDistance, self).__init__()

        self.logger.info("Loading Word2Vec embeddings.")
        if not Downloader().is_installed('stopwords'):
            download('stopwords')
        self.stopwords = stopwords.words('english')
        self.model = KeyedVectors.load_word2vec_format(
            "./data/GoogleNews-vectors-negative300.bin",  #.gz takes 2.5x longer
            binary=True)
        self.logger.info("Done: loading Word2Vec embeddings.")
Example #7
def install_nltk(download_dir=None):
    """ Download specific collection identifiers """
    if not download_dir:
        download_dir = settings.NLTK_DATA_PATH
    downloader = Downloader(download_dir=download_dir)
    downloader.download('punkt')
    downloader.download('maxent_treebank_pos_tagger')
Example #8
    def build_list_from_nltk(self, lang):
        downloader = Downloader()
        tempdir = None
        
        # Check if the NLTK data has already been downloaded.
        if not downloader.is_installed('stopwords'):
            # Create temporary directory for download
            tempdir = tempfile.mkdtemp(prefix='cherami')
            logger.info('Downloading NLTK stopword data into "{0}"'
                '...'.format(tempdir))

            downloader.download('stopwords', tempdir, True)
            logger.info('NLTK stopword data downloaded.')

            nltk.data.path = [tempdir]

        for word in stopwords.words(lang):
            self.stopword_list.add(word)

        # Clean up after we're done.
        if tempdir is not None:
            shutil.rmtree(tempdir)
Example #9
    def __init__(self):
        # check for stopwords installation
        if not Downloader().is_installed('stopwords'):
            download('stopwords')

        if config['wmd']['save_memory']:
            # 25% slower but 50% memory savings by reducing datatype size.
            self.model = KeyedVectors.load_word2vec_format(
                config['wmd']['word2vec'], binary=True, datatype=np.float16)
        else:
            self.model = KeyedVectors.load_word2vec_format(
                config['wmd']['word2vec'], binary=True)
        if config['wmd']['normalize']:
            # computes L2-norms of word weight vectors
            self.model.init_sims(replace=True)

        self.stopwords = stopwords.words('english')
Example #10
    def __init__(self):
        super(RssSkill, self).__init__('RssSkill')
        self._is_reading_headlines = False
        self.feeds = {}
        self.cached_items = {}
        self.cache_time = {}
        try:
            pos_tag('advance')
        except LookupError:
            logger.debug('Tagger not installed... Trying to download')
            dler = Downloader()
            if not dler.download('averaged_perceptron_tagger'):
                logger.debug('Trying alternative source...')
                dler = Downloader(ALT_NLTK_DATA)
                dler.download('averaged_perceptron_tagger',
                              raise_on_error=True)
Example #11
def nltk_download_corpus(
    resource_path,
    local_data=LOCAL_DATA,
    nltk_dir=NLTK_DIR,
):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    print(os.getcwd())
    from nltk.data import find
    if not local_data:
        from nltk import download
    else:
        from nltk.downloader import Downloader
        _downloader = Downloader(local_data=local_data, nltk_dir=nltk_dir)
        #print(os.listdir(_downloader._download_dir)) DEBUG, don't throw away
        download = _downloader.download

    from os.path import split, sep
    from zipfile import BadZipfile

    # Download the NLTK data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    if not resource_path.endswith(sep):
        resource_path = resource_path + sep

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True
    except BadZipfile:
        raise BadZipfile(
            'The NLTK corpus file being opened is not a zipfile, '
            'or it has been corrupted and needs to be manually deleted.')

    return downloaded
Example #12
def setup_nltk_on_lambda():
    """
    Getting nltk to run on lambda is tricky.

    We need to pregenerate the nltk libraries we are going to use
    and make them available to lambda
    """
    # we have to force the home variable in order to download
    # nltk files to the correct place for packaging
    os.environ['HOME'] = __here__
    from nltk.downloader import Downloader
    import nltk

    # deployed version of nltk data
    nltk.data.path = [os.path.join(__here__, 'nltk_data')] + nltk.data.path

    log.info("NLTK Path: %s", nltk.data.path)
    log.info("Default NLTK dir: %s", Downloader().default_download_dir())

    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')
Example #13
def _get_duc_sentences_2004():
    """
    Create a DucArticle for each article in the docs folder of Duc2004.
    Complete fields 'ID', 'folder', and 'sentence'.
    Returns:
        list<DucArticle>
    """
    if not Downloader().is_installed('punkt'):
        download('punkt')

    filenames = list()
    for root, _, files in os.walk(config["duc4_sentences_folder"],
                                  topdown=False):
        for name in files:
            filenames.append(os.path.join(root, name))

    articles = list()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for filename in filenames:
        with open(filename, 'r') as f:
            parsed_html = BeautifulSoup(f.read(), "lxml")
            corpus = parsed_html.find_all('text')[0].string
            tokenized = tokenizer.tokenize(corpus)
            if tokenized[0].split()[-1] not in config["duc_ending_exceptions"]:
                sentence = tokenized[0].encode('ascii', 'ignore')
            else:
                sentence = (tokenized[0] + ' ' + tokenized[1]).encode(
                    'ascii', 'ignore')

            article = DucArticle()
            article.id = parsed_html.docno.string.rstrip().lstrip().replace(
                '\n', ' ').encode('ascii', 'ignore')
            article.folder = filename.lstrip(
                config["duc4_sentences_folder"])[:5]
            article.sentence = _tokenize_sentence_generic(sentence)
            articles.append(article)

    return articles
Example #14
import os, nltk
from nltk.downloader import Downloader
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag, map_tag

nltk_packages = [
	'punkt',
	'maxent_treebank_pos_tagger',
	'universal_tagset',
	'wordnet'
]
nltk_path = os.path.dirname(os.path.realpath(__file__)) + '/nltk'
nltk.data.path.append(nltk_path)
nltk_dl = Downloader(download_dir = nltk_path)
for package in nltk_packages:
	nltk_dl.download(package)

primary_tags = set(['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON'])

processes = [
	'parts_all',
	'tokens_adj',
	'tokens_adv',
	'tokens_all',
	'tokens_dense',
	'tokens_noun',
	'tokens_other',
	'tokens_pron',
	'tokens_verb'
	]
Example #15
import re
import csv
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.downloader import Downloader
from nltk.probability import FreqDist
from nltk.corpus import stopwords as stopwords_corpus
from nltk import pos_tag
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

from sklearn.metrics import f1_score

downloader = Downloader()
downloader.download("stopwords")
nltk.download('averaged_perceptron_tagger')

with open("../data/full.csv", "r") as csvfile:
    reader = csv.reader(csvfile, quoting=csv.QUOTE_ALL)
    documents = [t[0] for t in reader]

matplotlib.rcParams.update({'font.size': 22})
stop_words = set(stopwords_corpus.words("english"))

_non_alpha = re.compile("[^a-zA-Z ]")


def normalize(text):
    """Map a token to a canonical form, e.g. lower case it, remove non-alpha characters, etc. 
Example #16
import os

from nltk.downloader import Downloader
from childespy import CHILDESDir

childes_xml_remote_root = 'http://childes.psy.cmu.edu/data-xml/'

user_data_path = Downloader.default_download_dir(Downloader())
childes_corpus_path = os.path.join(user_data_path, 'corpora/CHILDES/')

try:
    os.makedirs(childes_corpus_path)
except OSError:
    pass

CHILDESDir(childes_xml_remote_root, childes_corpus_path).download()
Example #18
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet_ic as wn_ic
from nltk.corpus.reader.wordnet import Synset as WordNetSynset

# Make sure the necessary corpora are downloaded to the local drive
for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
    try:
        nltk.data.find("corpora/" + token)
    except LookupError:
        try:
            nltk.download(token, quiet = True, raise_on_error = True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL. Then we will try this...
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet = True, raise_on_error = True)

# Use the Brown corpus for calculating information content (IC)
brown_ic = wn_ic.ic('ic-brown.dat')
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------

DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
Example #19
    def __init__(self, data_id=default_data_id):
        Downloader.__init__(self)
        self.data_id = data_id
Example #21
import nltk
from nltk.downloader import Downloader
from qanom.candidate_extraction import cand_utils
from qanom.annotations.common import read_csv
from qanom.candidate_extraction.verb_to_nom import SuffixBasedNominalizationCandidates as VTN
""" Define which resources should be used (by default) for filtering nouns as candidate nominalizations. """
default_resources = {
    "wordnet": True,
    "catvar": True,
    "affixes_heuristic": True
}

vtn = None  # init this global VTN object only if required

# by default, use nltk's default pos_tagger ('averaged_perceptron_tagger'):
tagger_package = 'averaged_perceptron_tagger'
nltk_downloader = Downloader()
if (not nltk_downloader.is_installed(tagger_package)):
    nltk.download(tagger_package)

pos_tag = nltk.pos_tag
"""
Alternatively, when extracting candidates for crowdsourcing QANom annotations through the qasrl-crowdsourcing project,
one should use the same POS model as the one used inside qasrl-crowdsourcing, for consistency.
qasrl-crowdsourcing uses the CoreNLPParser model in Java; here we use nltk's CoreNLPParser wrapper.
To run the CoreNLPParser model as a server on your machine (port 9000),
first run the following command from the unzipped directory of the stanford-core-nlp project
(see https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/ for instructions):
```bash
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000
```
Then, replace the above two lines in the Python script with the following block:
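A minimal sketch of such a replacement block, assuming NLTK's CoreNLPParser wrapper and a CoreNLP server listening on localhost:9000 (an illustration, not necessarily qanom's exact code):
```python
from nltk.parse.corenlp import CoreNLPParser

# POS-tag through the local CoreNLP server instead of nltk's averaged perceptron tagger.
corenlp_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
pos_tag = corenlp_tagger.tag
```
"""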
Example #22
class OWPreprocess(OWWidget):

    name = 'Preprocess Text'
    description = 'Construct a text pre-processing pipeline.'
    icon = 'icons/TextPreprocess.svg'
    priority = 30

    inputs = [(Input.CORPUS, Corpus, 'set_data')]
    outputs = [(Output.PP_CORPUS, Corpus)]

    autocommit = settings.Setting(True)
    # Persistent data for each module is stored here.
    persistent_data_tokenizer = settings.Setting({})
    persistent_data_casing = settings.Setting({})
    persistent_data_stemmer = settings.Setting({})
    persistent_data_filter = settings.Setting({})
    preprocessors = []  # Pre-processing modules for the current run.

    UserAdviceMessages = [
        widget.Message(
            "Some preprocessing methods require data (like word relationships, stop words, "
            "punctuation rules etc.) from the NLTK package. This data, if you didn't have it "
            "already, was downloaded to: {}".format(
                Downloader().default_download_dir()), "nltk_data")
    ]

    def __init__(self, parent=None):
        super().__init__(parent)

        self.corpus = None

        # -- INFO --
        info_box = gui.widgetBox(self.controlArea, 'Info')
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_box, self,
                                    'No input corpus detected.')
        # Commit checkbox and commit button.
        output_box = gui.widgetBox(self.controlArea, 'Output')
        auto_commit_box = gui.auto_commit(output_box,
                                          self,
                                          'autocommit',
                                          'Commit',
                                          box=False)
        auto_commit_box.setMinimumWidth(170)

        # -- PIPELINE --
        frame = QFrame()
        frame.setContentsMargins(0, 0, 0, 0)
        frame.setFrameStyle(QFrame.Box)
        frame.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')
        frame_layout = QVBoxLayout()
        frame_layout.setContentsMargins(0, 0, 0, 0)
        frame_layout.setSpacing(0)
        frame.setLayout(frame_layout)
        # Load the previous states.
        persistent_data = [
            self.persistent_data_tokenizer,
            self.persistent_data_stemmer,
            self.persistent_data_casing,
            self.persistent_data_filter,
        ]
        for ModuleClass, ModuleData in zip(PREPROCESSOR_MODULES,
                                           persistent_data):
            pp_module_widget = ModuleClass(ModuleData)  # Create pp instance.
            self.preprocessors.append(pp_module_widget)
            pp_module_widget.change_signal.connect(self.settings_invalidated)
            pp_module_widget.error_signal.connect(self.display_message)

            frame_layout.addWidget(pp_module_widget)
        self.store_pipeline()  # Store the pipeline after loading it.

        frame_layout.addStretch()
        self.mainArea.layout().addWidget(frame)

        self.progress_bar = None  # Progress bar initialization.

    def set_data(self, data=None):
        self.corpus = data
        self.update_info()
        self.commit()

    def update_info(self):
        if self.corpus is not None:
            info = 'Document count: {}'.format(len(self.corpus))
        else:
            info = 'No input corpus detected.'
        self.info_label.setText(info)

    def commit(self):
        self.store_pipeline()  # Store the new pipeline.
        if self.corpus is not None:
            pp = self.assemble_preprocessor()
            if pp is not None:
                self.apply(pp)

    def apply(self, preprocessor):
        with self.progressBar(len(self.corpus) * 2) as progress_bar:
            self.progress_bar = progress_bar
            output = preprocessor(self.corpus)
        self.progress_bar = None
        self.send(Output.PP_CORPUS, output)

    def assemble_preprocessor(self):
        self.error(0, '')

        pp_settings = {
            # If disabled, this defaults to True, which is not what we want.
            'lowercase': False,
        }
        for pp in self.preprocessors:
            if pp.enabled:
                pp_settings.update(pp.get_pp_setting())
        pp_settings['callback'] = self.document_finished
        try:
            preprocessor = Preprocessor(**pp_settings)
        except Exception as e:
            self.error(0, str(e))
            return None
        return preprocessor

    def store_pipeline(self):
        for pp in self.preprocessors:
            if isinstance(pp, TokenizerModule):
                self.persistent_data_tokenizer = pp.export_data()
            elif isinstance(pp, CasingModule):
                self.persistent_data_casing = pp.export_data()
            elif isinstance(pp, TransformationModule):
                self.persistent_data_stemmer = pp.export_data()
            elif isinstance(pp, FilteringModule):
                self.persistent_data_filter = pp.export_data()

    def document_finished(self):
        if self.progress_bar is not None:
            self.progress_bar.advance()

    @Slot()
    def settings_invalidated(self):
        self.commit()

    @Slot(str)
    def display_message(self, message):
        self.error(0, message)