Example #1
def compute_emb(pages_path_in, pages_path_out, vocab):

    wemb = pkl.load(open(prm.wordemb_path, 'rb'))
    dim_emb = next(iter(wemb.values())).shape[0]
    W = 0.01 * np.random.randn(len(vocab), dim_emb).astype(np.float32)
    for word, pos in vocab.items():
        if word in wemb:
            W[pos,:] = wemb[word]

    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    if os.path.exists(pages_path_out):
        os.remove(pages_path_out)

    # Save to HDF5
    fout = h5py.File(pages_path_out,'a')

    if prm.att_doc:
        shape = (f['text'].shape[0],prm.max_segs_doc,prm.dim_emb)
    else:
        shape=(f['text'].shape[0],prm.dim_emb)

    embs = fout.create_dataset('emb', shape=shape, dtype=np.float32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type == 'section':
                segs = ['']
                for line in text.split('\n'):
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line + '\n'
            elif prm.att_segment_type == 'sentence':
                segs = tokenizer.tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter.')

            segs = segs[:prm.max_segs_doc]
            emb_ = utils.Word2Vec_encode(segs, wemb)
            embs[i,:len(emb_),:] = emb_
            mask[i] = len(emb_)
        else:
            bow0, bow1 = utils.BOW(wordpunct_tokenize(text.lower()), vocab)
            emb = (W[bow0] * bow1[:,None]).sum(0)
            embs[i,:] = emb
        i += 1
        #if i > 3000:
        #    break

        print('processing article', i, 'time', time.time() - st)

    f.close()
    fout.close()
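In the non-attention branch above, the document embedding is a weighted sum of word vectors: bow0 holds vocabulary indices and bow1 the matching weights. A toy NumPy illustration (values invented, names mirror the snippet):

import numpy as np

W = 0.01 * np.random.randn(5, 3).astype(np.float32)  # toy embedding matrix: vocabulary of 5 words, dimension 3
bow0 = np.array([0, 2, 4])                            # indices of the words present in the document
bow1 = np.array([2.0, 1.0, 3.0], dtype=np.float32)    # their weights (e.g. term counts)
emb = (W[bow0] * bow1[:, None]).sum(0)                # weighted sum of word vectors, shape (3,)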
Example #2
    def _build_wordset(clazz, obscurity_limit):
        # I'm sorry this method is so disgusting.
        # It's all in the cause of fast loading in the main case.

        from nltk import FreqDist

        # Ensure corpora are loaded.
        try:
            from nltk.corpus import cmudict
            cmudict.entries()
        except LookupError:
            print "CMUDict corpus not found. Downloading..."
            from nltk import download
            download('cmudict')
            print "[Done]"
        if obscurity_limit is not None:
            from nltk.corpus import brown
            try:
                brown.words()
            except LookupError:
                print "Brown corpus not found. Downloading...",
                from nltk import download
                download('brown')
                print "[Done]"

        words = cmudict.entries()
        if obscurity_limit is not None:
            freqs = FreqDist([w.lower() for w in brown.words()])
            words = sorted(words,
                           key=lambda x: freqs[x[0].lower()],
                           reverse=True)
            return words[:obscurity_limit]
        else:
            return list(words)
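The try/except LookupError pattern above (and in several examples below) is the usual way to fetch NLTK data lazily. A minimal generalized sketch, with a hypothetical helper name that does not appear in any of these projects:

import nltk

def ensure_nltk_resource(resource_path, package):
    """Download package only if resource_path is not already installed."""
    try:
        nltk.data.find(resource_path)   # e.g. 'corpora/cmudict'
    except LookupError:
        nltk.download(package)          # e.g. 'cmudict'

ensure_nltk_resource('corpora/cmudict', 'cmudict')
ensure_nltk_resource('corpora/brown', 'brown')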
Example #3
def _download_nltk_data():
    """Install corpus data.
    """
    for directory, data in nltk_data.items():
        for datum in data:
            if not exists(join(NLTK_DATA_DIR, directory, datum)):
                nltk.download(datum, download_dir=NLTK_DATA_DIR)
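Here nltk_data is expected to map target subdirectories under NLTK_DATA_DIR to the packages that belong in them; a purely illustrative value (not taken from the original project) would be:

nltk_data = {
    'corpora': ['stopwords', 'wordnet'],
    'tokenizers': ['punkt'],
}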
Example #4
    def handle(self, *args, **options):
        if args is None or len(args) < 2:
            pages = Page.objects.all()
            for page in pages:
                self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
            raise CommandError('Invalid arguments. Expected: <page_id> <action>, where action might be: extract, tfidf, webidf')


        page_id = args[0]
        action = args[1]

        if page_id == 'setup':
            self._log.info("invoking nltk download")
            nltk.download()
            exit()

        self._log.info('AnalyticsCommand initializing.')

        self._log.info('Page-Id: %s' % page_id)
        page = Page.objects.get(id=page_id)

        if action == "extract":
            self.processPageExtract(page)
        elif action == "tfidf":
            self.processTfIdf(page)
        elif action == "webidf":
            self.processWebIdf(page)
        else:
            self._log.warn("Unknown action: %s" % action)

        self._log.info("All done for now.")
Example #5
def installNLTKResources():

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/europarl_raw')
    except LookupError:
        nltk.download('europarl_raw')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/gutenberg')
    except LookupError:
        nltk.download('gutenberg')

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
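The five blocks above are identical apart from the resource being checked; a more compact equivalent (a sketch, not the original code) loops over (path, package) pairs:

import nltk

def install_nltk_resources():
    resources = [
        ('corpora/wordnet', 'wordnet'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/europarl_raw', 'europarl_raw'),
        ('tokenizers/punkt', 'punkt'),
        ('corpora/gutenberg', 'gutenberg'),
    ]
    for path, package in resources:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(package)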
Example #6
 def __init__(self, save_path=None, download=False, tokenizer="wordpunct_tokenize", *args, **kwargs):
     super().__init__(save_path=save_path)
     if download:
         nltk.download()
     self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
     if not callable(self.tokenizer):
         raise AttributeError("Tokenizer {} is not defined in nltk.tokenizer".format(tokenizer))
Example #7
def _post_install(dir):
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')
Example #8
 def __init__(self):
     print("Please Install the brown-corpus and wordnet on your machine : ")
     nltk.download()
     self.pfile = open("pcent_plurals.txt","w")
     self.pfile.write("%s %s \n" % ("Plurals".ljust(20),"Percentages"))
     self.plural_dict = {}
     self.single_dict = {}
Example #9
def search_for_all_strings(line, file_format):
    '''Search for all strings with NLTK'''
    result = []
    for regexp in Config.excluded_lines:
        for match in re.finditer(regexp, line):
            if match:
                return([])

    for regexp in Config.strings_patterns[file_format]:
        for match in re.finditer(regexp, line):
            if not match:
                continue
            group = match.group(1)
            if len(group) > 0 and not contains_forbidden_patterns(group):
                try:
                    tokens = nltk.word_tokenize(group)
                    if len(tokens) > 0:
                        for word in tokens:
                            morf = wn.morphy(word)
                            if morf and len(str(morf)) > 1:
                                if (output_format == "csv") | (group not in global_word_pull):
                                    result.append(group)
                                    global_word_pull.add(group)
                                break
                except:
                    print ("Unexpected error:{0}".format(sys.exc_info()))
                    traceback.print_tb(sys.exc_info()[2])
                    url = os.path.join(os.path.split(os.path.realpath(__file__))[0] + "/nltk_info.html")
                    print("See here for installation instructions:\n" + url)
                    webbrowser.open_new(url)

                    nltk.download()
                    sys.exit(2)

    return result
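For reference, wn.morphy returns a lemma for tokens WordNet recognizes and None otherwise, which is what the length check above relies on:

from nltk.corpus import wordnet as wn

wn.morphy('strings')  # 'string'
wn.morphy('asdfgh')   # None (not a word WordNet knows)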
Example #10
def main():

    nltk.download('stopwords')
    nltk.download('vader_lexicon')        
        
    print("\n================================================================================\n")
    print("---------------------------------- Platform Information ------------------------")
    print('machine: {}'.format(platform.machine()))
    print('node: {}'.format(platform.node()))    
    print('processor: {}'.format(platform.processor()))    
    print('release: {}'.format(platform.release()))
    print('system: {}'.format(platform.system()))    
    print('version: {}'.format(platform.version()))
    print('uname: {}'.format(platform.uname()))
    
    #mem = virtual_memory()
    #print('memory: {}'.format(mem.total))  # total physical memory available
    
    print('python_build: {}'.format(platform.python_build()))
    print('python_compiler: {}'.format(platform.python_compiler()))
    print('python_branch: {}'.format(platform.python_branch()))
    print('python_implementation: {}'.format(platform.python_implementation()))
    
    print('python_revision: {}'.format(platform.python_revision()))
    print('python_version: {}'.format(platform.python_version()))
    
    print("\n================================================================================\n")
Example #11
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
Example #12
    def __init__(self, ngram=False, use_idf=False):
        self.ngram = ngram
        self.use_idf = use_idf

        # Load WordNet synsets and download data if necessary
        try:
            wordnet_path = nltk.data.find("corpora/wordnet")
        except LookupError:
            nltk.download("wordnet")
            wordnet_path = nltk.data.find("corpora/wordnet")
        self.wn = wordnet.WordNetCorpusReader(wordnet_path)

        # Initialize the two types of n-gram generators
        pentagram_vectorizer = CountVectorizer(
            ngram_range=(1, 5), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
        )
        unigram_vectorizer = CountVectorizer(
            ngram_range=(1, 1), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
        )

        # Function for generating five-grams through unigrams
        self.pent_analyze = pentagram_vectorizer.build_analyzer()

        # Function for generating just unigrams
        self.uni_analyze = unigram_vectorizer.build_analyzer()

        # Load IDF scores
        self.IDF = self.get_idf_scores()
        self.counts = self.get_counts()
Example #13
def main():
    import io

    with io.open(os.path.join(HERE, "README.rst"), "r") as readme:
        setup(
            name=app.__project__,
            version=app.__version__,
            description=app.__doc__,
            long_description=readme.read(),
            classifiers=app.__classifiers__,
            author=app.__author__,
            author_email=app.__author_email__,
            # url                  = app.__url__,
            license=[c.rsplit("::", 1)[1].strip() for c in app.__classifiers__ if c.startswith("License ::")][0],
            keywords=" ".join(app.__keywords__),
            packages=["mancify"],
            package_data={},
            include_package_data=True,
            platforms=app.__platforms__,
            install_requires=app.__requires__,
            extras_require=app.__extra_requires__,
            zip_safe=True,
            entry_points=app.__entry_points__,
            tests_require=["pytest-cov", "pytest", "mock"],
            cmdclass={"test": PyTest},
        )

    # Download the required NLTK packages automatically
    import nltk

    nltk.download("cmudict")
    nltk.download("maxent_treebank_pos_tagger")
Example #14
 def run(self):
     _install.run(self)
     import nltk
     try:
         nltk.data.find('tokenizers/punkt')
     except LookupError:
         nltk.download('punkt')
Example #15
def annotations_to_words(terms, dag, ipr_map, lower):
    """
    Converts a string of accessions into a string of the corresponding English-text representations.
    """
    try:
        sws = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        sws = stopwords.words('english')

    if lower:
        sws = set([x.lower() for x in sws])
        case = str.lower
    else:
        sws = set([x.upper() for x in sws])
        case = str.upper

    go_terms = [t.upper() for t in terms if 'GO' in t.upper()]
    ipr_terms = [t.upper() for t in terms if t.upper() in ipr_map]

    go_descriptions = ' '.join([case(dag[t].name) for t in go_terms]).split(' ')
    ipr_descriptions = ' '.join([case(ipr_map[t]) for t in ipr_terms]).split(' ')

    go_descriptions = [x.translate(str.maketrans('', '', string.punctuation)) for x in go_descriptions]
    ipr_descriptions = [x.translate(str.maketrans('', '', string.punctuation)) for x in ipr_descriptions]

    go_descriptions = [x for x in go_descriptions if case(x) not in sws]
    ipr_descriptions = [x for x in ipr_descriptions if case(x) not in sws]

    line = ' '.join(go_descriptions + ipr_descriptions)
    return line
Example #16
 def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False,
              *args, **kwargs):
     if download:
         nltk.download()
     self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
     if not callable(self.tokenizer):
         raise AttributeError("Tokenizer {} is not defined in nltk.tokenizer".format(tokenizer))
Example #17
def bootstrap_nltk_data():
    nltk.data.path.append('./data/')
    nltkdata_exists = Path('./data/tokenizers/punkt/english.pickle')

    if not nltkdata_exists.exists():
        logging.info("Downloading NLTK Data")
        nltk.download('punkt', './data')
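Once punkt is available on nltk.data.path, the standard sentence tokenizer can pick it up, for example:

from nltk.tokenize import sent_tokenize

sent_tokenize("NLTK is ready. Punkt was found locally.")
# ['NLTK is ready.', 'Punkt was found locally.']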
Example #18
def _post_install():
    from importlib import reload
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
Example #19
 def __init__(self):
     try:
         from sacremoses import MosesDetokenizer
         self._detokenizer = MosesDetokenizer()
     except (ImportError, TypeError) as err:
         if isinstance(err, TypeError):
             warnings.warn('The instantiation of MosesDetokenizer in sacremoses is'
                           ' currently only supported in python3.'
                           ' Now try NLTKMosesDetokenizer using NLTK ...')
         else:
             warnings.warn('sacremoses is not installed. '
                           'To install sacremoses, use pip install -U sacremoses'
                           ' Now try NLTKMosesDetokenizer using NLTK ...')
         try:
             import nltk
             try:
                 nltk.data.find('perluniprops')
             except LookupError:
                 nltk.download('perluniprops')
             from nltk.tokenize.moses import MosesDetokenizer
             self._detokenizer = MosesDetokenizer()
         except ImportError:
             raise ImportError('NLTK is not installed. '
                               'You must install NLTK <= 3.2.5 in order to use the '
                               'NLTKMosesDetokenizer. You can refer to the official '
                               'installation guide in https://www.nltk.org/install.html .')
Example #20
def nltk_corpus(corpus_name):
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except:
        nltk.download(corpus_name)
    return corpus
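A usage sketch: the returned object is the usual lazy corpus reader, downloaded on first use if it is missing.

brown = nltk_corpus('brown')
print(brown.words()[:3])  # e.g. ['The', 'Fulton', 'County']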
Example #21
def generate(dictionary='/usr/share/dict/british-english', output='../../gb-us-synonyms.txt'):
    nltk.download('wordnet')
    with open(dictionary) as dict_file:
        with open(output, 'w') as output_file:
            for gb, us in gen_synonyms(dict_file):
                output_file.write(gb + ', ' + us + '\n')
                print(gb + ',', us)
Example #22
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the wordnet data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
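A usage sketch; the return value indicates whether a download was actually needed:

nltk_download_corpus('sentiment/vader_lexicon')  # True on the first call if the lexicon was missing
nltk_download_corpus('corpora/stopwords')        # False once the data is already present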
Example #23
def morphy(doc):
    """Lemmatize tokens using morphy, WordNet's lemmatizer."""
    # XXX Results will be better if we do POS tagging first, but then we
    # need to map Penn Treebank tags to WordNet tags.
    nltk.download('wordnet', quiet=False)
    return map(nltk.WordNetLemmatizer().lemmatize,
               _tokenize_if_needed(fetch(doc)))
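The comment about POS tagging matters in practice: WordNetLemmatizer assumes nouns unless told otherwise, so verb forms are only reduced when the POS is passed explicitly:

from nltk.stem import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('running')       # 'running' (treated as a noun by default)
lmtzr.lemmatize('running', 'v')  # 'run'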
Example #24
def download_dataset(data_name):
    if data_name == 'flowers':
        print('== Flowers dataset ==')
        flowers_dir = os.path.join(DATA_DIR, 'flowers')
        flowers_jpg_tgz = os.path.join(flowers_dir, '102flowers.tgz')
        make_sure_path_exists(flowers_dir)

        # The original Google Drive link at https://drive.google.com/file/d/0B0ywwgffWnLLcms2WWJQRFNSWXM/view
        # from https://github.com/reedscot/icml2016 is problematic to download automatically, so the
        # text_c10 directory from that archive is included as a bzipped file in the repo.
        captions_tbz = os.path.join(DATA_DIR, 'flowers_text_c10.tar.bz2')
        print('Extracting ' + captions_tbz)
        captions_tar = tarfile.open(captions_tbz, 'r:bz2')
        captions_tar.extractall(flowers_dir)

        flowers_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
        print('Downloading ' + flowers_jpg_tgz + ' from ' + flowers_url)
        urlretrieve(flowers_url, flowers_jpg_tgz,
                    reporthook=dl_progress_hook)
        print('Extracting ' + flowers_jpg_tgz)
        flowers_jpg_tar = tarfile.open(flowers_jpg_tgz, 'r:gz')
        flowers_jpg_tar.extractall(flowers_dir)  # archive contains jpg/ folder

    elif data_name == 'skipthoughts':
        print('== Skipthoughts models ==')
        SKIPTHOUGHTS_DIR = os.path.join(DATA_DIR, 'skipthoughts')
        SKIPTHOUGHTS_BASE_URL = 'http://www.cs.toronto.edu/~rkiros/models/'
        make_sure_path_exists(SKIPTHOUGHTS_DIR)

        # following https://github.com/ryankiros/skip-thoughts#getting-started
        skipthoughts_files = [
            'dictionary.txt', 'utable.npy', 'btable.npy', 'uni_skip.npz', 'uni_skip.npz.pkl', 'bi_skip.npz',
            'bi_skip.npz.pkl',
        ]
        for filename in skipthoughts_files:
            src_url = SKIPTHOUGHTS_BASE_URL + filename
            print('Downloading ' + src_url)
            urlretrieve(src_url, os.path.join(SKIPTHOUGHTS_DIR, filename),
                        reporthook=dl_progress_hook)

    elif data_name == 'nltk_punkt':
        import nltk
        print('== NLTK pre-trained Punkt tokenizer for English ==')
        nltk.download('punkt')

    elif data_name == 'pretrained_model':
        print('== Pretrained model ==')
        MODEL_DIR = os.path.join(DATA_DIR, 'Models')
        pretrained_model_filename = 'latest_model_flowers_temp.ckpt'
        src_url = 'https://bitbucket.org/paarth_neekhara/texttomimagemodel/raw/74a4bbaeee26fe31e148a54c4f495694680e2c31/' + pretrained_model_filename
        print('Downloading ' + src_url)
        urlretrieve(
            src_url,
            os.path.join(MODEL_DIR, pretrained_model_filename),
            reporthook=dl_progress_hook,
        )

    else:
        raise ValueError('Unknown dataset name: ' + data_name)
Example #25
def _post_install():  
    # since nltk may have just been installed,
    # we need to update our PYTHONPATH
    import site
    reload(site)
    # Now we can import nltk
    import nltk
    nltk.download('stopwords')
Example #26
 def test_notebook_runner_2a_eco_nlp_correction(self):
     fLOG(
         __file__,
         self._testMethodName,
         OutputPrint=__name__ == "__main__")
     import nltk
     nltk.download('stopwords')
     self.common_notebook_runner_2a_eco_nlp_enonce("correction")
Example #27
 def download_packages(self):
     import nltk
     
     for x in [comp for comp in self._missing if "/" in comp]:
         package = x.split("/")[1]
         self.updateLabel.emit(package)
         nltk.download(package, raise_on_error=True)
         self.progressTheBar.emit()
Example #28
 def setup_dependencies(self):
     if not self._nltk_data_downloaded and bool(
             self.section.get('shortlog_imperative_check', True)):
         nltk.download([
             'punkt',
             'averaged_perceptron_tagger',
         ])
         type(self)._nltk_data_downloaded = True
Example #29
def main():
    nltk.download("punkt")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("training_datasets", type=str, nargs="+",
                        help="path to training data sets")
    args = parser.parse_args(sys.argv[1:])

    run(args)
Example #30
    def __init__(self):
        from nltk.data import find
        from nltk import download

        try:
            find('wordnet.zip')
        except LookupError:
            download('wordnet')
import nltk
import numpy as np
import pandas as pd

import time

from textblob import TextBlob
import dill as pickle
# import pickle

from customTransfomers import *

t0 = time.time()
#-------------------------------------
#***********Setting up NLTK***********
#-------------------------------------

nltk.download('punkt')
nltk.download('stopwords')

#-------------------------------------
#***********Reading Data**************
#-------------------------------------
print("Reading Data ......")
df = pd.read_csv('fakecorpusWithMeta.csv')

print("Cleaning the Data ......")
# Drop unused columns
df = df.drop(columns=["scraped_at", "index", "Unnamed: 0"])

print("Filtering Data ......")
# If title is missing drop the entry
df = df.dropna(subset=['title'])