def compute_emb(pages_path_in, pages_path_out, vocab):

    wemb = pkl.load(open(prm.wordemb_path, 'rb'))
    dim_emb = wemb[next(iter(wemb))].shape[0]
    W = 0.01 * np.random.randn(len(vocab), dim_emb).astype(np.float32)
    for word, pos in vocab.items():
        if word in wemb:
            W[pos, :] = wemb[word]

    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    if os.path.exists(pages_path_out):
        os.remove(pages_path_out)

    # Save to HDF5
    fout = h5py.File(pages_path_out, 'a')

    if prm.att_doc:
        shape = (f['text'].shape[0], prm.max_segs_doc, prm.dim_emb)
    else:
        shape = (f['text'].shape[0], prm.dim_emb)

    embs = fout.create_dataset('emb', shape=shape, dtype=np.float32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            # Split the article into segments: by '== section ==' headers
            # or into sentences with the Punkt tokenizer.
            if prm.att_segment_type == 'section':
                segs = ['']
                for line in text.split('\n'):
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line + '\n'
            elif prm.att_segment_type == 'sentence':
                segs = tokenizer.tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter.')

            segs = segs[:prm.max_segs_doc]
            emb_ = utils.Word2Vec_encode(segs, wemb)
            embs[i, :len(emb_), :] = emb_
            mask[i] = len(emb_)
        else:
            # Bag-of-words average of the pre-trained word embeddings.
            bow0, bow1 = utils.BOW(wordpunct_tokenize(text.lower()), vocab)
            emb = (W[bow0] * bow1[:, None]).sum(0)
            embs[i, :] = emb
        i += 1
        #if i > 3000:
        #    break
        print('processing article', i, 'time', time.time() - st)

    f.close()
    fout.close()
def _build_wordset(clazz, obscurity_limit):
    # I'm sorry this method is so disgusting.
    # It's all in the cause of fast loading in the main case.

    from nltk import FreqDist

    # Ensure corpora are loaded.
    try:
        from nltk.corpus import cmudict
        cmudict.entries()
    except LookupError:
        print("CMUDict corpus not found. Downloading...")
        from nltk import download
        download('cmudict')
        print("[Done]")
    if obscurity_limit is not None:
        from nltk.corpus import brown
        try:
            brown.words()
        except LookupError:
            print("Brown corpus not found. Downloading...", end=' ')
            from nltk import download
            download('brown')
            print("[Done]")

    words = cmudict.entries()
    if obscurity_limit is not None:
        freqs = FreqDist([w.lower() for w in brown.words()])
        words = sorted(words,
                       key=lambda x: freqs[x[0].lower()],
                       reverse=True)
        return words[:obscurity_limit]
    else:
        return list(words)
def _download_nltk_data():
    """Install corpus data."""
    for directory, data in nltk_data.items():
        for datum in data:
            if not exists(join(NLTK_DATA_DIR, directory, datum)):
                nltk.download(datum, download_dir=NLTK_DATA_DIR)
def handle(self, *args, **options):
    if args is None or len(args) < 2:
        pages = Page.objects.all()
        for page in pages:
            self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
        raise CommandError('Invalid arguments. Expected: <page_id> <action>, where action might be: extract, tfidf, webidf')

    page_id = args[0]
    action = args[1]

    if page_id == 'setup':
        self._log.info("invoking nltk download")
        nltk.download()
        exit()

    self._log.info('AnalyticsCommand initializing.')
    self._log.info('Page-Id: %s' % page_id)

    page = Page.objects.get(id=page_id)

    if action == "extract":
        self.processPageExtract(page)
    elif action == "tfidf":
        self.processTfIdf(page)
    elif action == "webidf":
        self.processWebIdf(page)
    else:
        self._log.warn("Unknown action: %s" % action)

    self._log.info("All done for now.")
def installNLTKResources():
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/europarl_raw')
    except LookupError:
        nltk.download('europarl_raw')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/gutenberg')
    except LookupError:
        nltk.download('gutenberg')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###

    return None
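# The snippet above repeats the same check-then-download pattern once per
# resource. A more compact variant (a sketch, assuming the same five resources
# and only the standard nltk.data.find / nltk.download APIs) drives the pattern
# from a list of (resource path, package name) pairs:
import nltk

_REQUIRED_NLTK_RESOURCES = [
    ('corpora/wordnet', 'wordnet'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/europarl_raw', 'europarl_raw'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/gutenberg', 'gutenberg'),
]

def install_nltk_resources_compact():
    # Download each resource only if nltk cannot already locate it locally.
    for resource_path, package in _REQUIRED_NLTK_RESOURCES:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)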
def __init__(self, save_path=None, download=False, tokenizer="wordpunct_tokenize", *args, **kwargs):
    super().__init__(save_path=save_path)
    if download:
        nltk.download()

    self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
    if not callable(self.tokenizer):
        raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer))
def _post_install(dir):
    # Reload site so the freshly installed nltk package becomes importable.
    from importlib import reload
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')
def __init__(self):
    print("Please install the brown corpus and wordnet on your machine: ")
    nltk.download()
    self.pfile = open("pcent_plurals.txt", "w")
    self.pfile.write("%s %s \n" % ("Plurals".ljust(20), "Percentages"))
    self.plural_dict = {}
    self.single_dict = {}
def search_for_all_strings(line, file_format):
    '''Search for all strings with NLTK'''
    result = []

    for regexp in Config.excluded_lines:
        for match in re.finditer(regexp, line):
            if match:
                return []

    for regexp in Config.strings_patterns[file_format]:
        for match in re.finditer(regexp, line):
            if not match:
                continue
            group = match.group(1)
            if len(group) > 0 and not contains_forbidden_patterns(group):
                try:
                    tokens = nltk.word_tokenize(group)
                    if len(tokens) > 0:
                        for word in tokens:
                            morf = wn.morphy(word)
                            if morf and len(str(morf)) > 1:
                                if (output_format == "csv") | (group not in global_word_pull):
                                    result.append(group)
                                    global_word_pull.add(group)
                                break
                except:
                    print("Unexpected error: {0}".format(sys.exc_info()))
                    traceback.print_tb(sys.exc_info()[2])
                    url = os.path.join(os.path.split(os.path.realpath(__file__))[0] + "/nltk_info.html")
                    print("See here for installation instructions:\n" + url)
                    webbrowser.open_new(url)
                    nltk.download()
                    sys.exit(2)

    return result
def main():
    nltk.download('stopwords')
    nltk.download('vader_lexicon')

    print("\n================================================================================\n")
    print("---------------------------------- Platform Information ------------------------")
    print('machine: {}'.format(platform.machine()))
    print('node: {}'.format(platform.node()))
    print('processor: {}'.format(platform.processor()))
    print('release: {}'.format(platform.release()))
    print('system: {}'.format(platform.system()))
    print('version: {}'.format(platform.version()))
    print('uname: {}'.format(platform.uname()))
    #mem = virtual_memory()
    #print('memory: {}'.format(mem.total))  # total physical memory available
    print('python_build: {}'.format(platform.python_build()))
    print('python_compiler: {}'.format(platform.python_compiler()))
    print('python_branch: {}'.format(platform.python_branch()))
    print('python_implementation: {}'.format(platform.python_implementation()))
    print('python_revision: {}'.format(platform.python_revision()))
    print('python_version: {}'.format(platform.python_version()))
    print("\n================================================================================\n")
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
def __init__(self, ngram=False, use_idf=False):
    self.ngram = ngram
    self.use_idf = use_idf

    # Load WordNet synsets and download data if necessary
    try:
        wordnet_path = nltk.data.find("corpora/wordnet")
    except LookupError:
        nltk.download("wordnet")
        wordnet_path = nltk.data.find("corpora/wordnet")
    self.wn = wordnet.WordNetCorpusReader(wordnet_path)

    # Initialize the two types of n-gram generators
    pentagram_vectorizer = CountVectorizer(
        ngram_range=(1, 5), token_pattern=r"\b[A-Za-z]+\b",
        min_df=1, stop_words=stop_list)
    unigram_vectorizer = CountVectorizer(
        ngram_range=(1, 1), token_pattern=r"\b[A-Za-z]+\b",
        min_df=1, stop_words=stop_list)

    # Function for generating five-grams through unigrams
    self.pent_analyze = pentagram_vectorizer.build_analyzer()

    # Function for generating just unigrams
    self.uni_analyze = unigram_vectorizer.build_analyzer()

    # Load IDF scores
    self.IDF = self.get_idf_scores()
    self.counts = self.get_counts()
def main():
    import io
    with io.open(os.path.join(HERE, "README.rst"), "r") as readme:
        setup(
            name=app.__project__,
            version=app.__version__,
            description=app.__doc__,
            long_description=readme.read(),
            classifiers=app.__classifiers__,
            author=app.__author__,
            author_email=app.__author_email__,
            # url = app.__url__,
            license=[
                c.rsplit("::", 1)[1].strip()
                for c in app.__classifiers__
                if c.startswith("License ::")
            ][0],
            keywords=" ".join(app.__keywords__),
            packages=["mancify"],
            package_data={},
            include_package_data=True,
            platforms=app.__platforms__,
            install_requires=app.__requires__,
            extras_require=app.__extra_requires__,
            zip_safe=True,
            entry_points=app.__entry_points__,
            tests_require=["pytest-cov", "pytest", "mock"],
            cmdclass={"test": PyTest},
        )

    # Download the required NLTK packages automatically
    import nltk
    nltk.download("cmudict")
    nltk.download("maxent_treebank_pos_tagger")
def run(self):
    _install.run(self)

    import nltk
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
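# For context, a run() override like the one above is normally registered
# through the cmdclass argument of setup(); the sketch below shows one way to
# wire it up (the package metadata is illustrative, not taken from the
# original project):
from setuptools import setup
from setuptools.command.install import install as _install

class InstallWithNltkData(_install):
    def run(self):
        _install.run(self)
        import nltk
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

setup(
    name='example-package',        # illustrative
    version='0.1.0',               # illustrative
    packages=['example_package'],  # illustrative
    install_requires=['nltk'],
    cmdclass={'install': InstallWithNltkData},
)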
def annotations_to_words(terms, dag, ipr_map, lower):
    """
    Converts a string of accessions into a string of the corresponding
    english-text representations.
    """
    try:
        sws = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        sws = stopwords.words('english')

    if lower:
        sws = set([x.lower() for x in sws])
        case = str.lower
    else:
        sws = set([x.upper() for x in sws])
        case = str.upper

    go_terms = [t.upper() for t in terms if 'GO' in t.upper()]
    ipr_terms = [t.upper() for t in terms if t.upper() in ipr_map]

    go_descriptions = ' '.join([case(dag[t].name) for t in go_terms]).split(' ')
    ipr_descriptions = ' '.join([case(ipr_map[t]) for t in ipr_terms]).split(' ')

    # Strip punctuation, then drop stopwords.
    strip_punct = str.maketrans('', '', string.punctuation)
    go_descriptions = [x.translate(strip_punct) for x in go_descriptions]
    ipr_descriptions = [x.translate(strip_punct) for x in ipr_descriptions]

    go_descriptions = [x for x in go_descriptions if case(x) not in sws]
    ipr_descriptions = [x for x in ipr_descriptions if case(x) not in sws]

    line = ' '.join(go_descriptions + ipr_descriptions)
    return line
def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False, *args, **kwargs):
    if download:
        nltk.download()

    self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
    if not callable(self.tokenizer):
        raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer))
def boostrap_nltk_data():
    nltk.data.path.append('./data/')
    nltkdata_exists = Path('./data/tokenizers/punkt/english.pickle')
    if not nltkdata_exists.exists():
        logging.info("Downloading NLTK Data")
        nltk.download('punkt', './data')
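# Hypothetical usage of the helper above: once './data/' is on nltk.data.path
# and the punkt model has been downloaded there, sentence tokenization resolves
# the model from that local directory (assuming an NLTK version whose
# sent_tokenize only needs the punkt model):
import nltk

boostrap_nltk_data()
sentences = nltk.sent_tokenize("The model lives in ./data. No other setup is needed.")
print(sentences)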
def _post_install():
    from importlib import reload
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
def __init__(self):
    try:
        from sacremoses import MosesDetokenizer
        self._detokenizer = MosesDetokenizer()
    except (ImportError, TypeError) as err:
        if isinstance(err, TypeError):
            warnings.warn('The instantiation of MosesDetokenizer in sacremoses is'
                          ' currently only supported in python3.'
                          ' Now try NLTKMosesDetokenizer using NLTK ...')
        else:
            warnings.warn('sacremoses is not installed. '
                          'To install sacremoses, use pip install -U sacremoses'
                          ' Now try NLTKMosesDetokenizer using NLTK ...')
        try:
            import nltk
            try:
                nltk.data.find('perluniprops')
            except LookupError:
                nltk.download('perluniprops')
            from nltk.tokenize.moses import MosesDetokenizer
            self._detokenizer = MosesDetokenizer()
        except ImportError:
            raise ImportError('NLTK is not installed. '
                              'You must install NLTK <= 3.2.5 in order to use the '
                              'NLTKMosesDetokenizer. You can refer to the official '
                              'installation guide in https://www.nltk.org/install.html .')
def nltk_corpus(corpus_name):
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except:
        nltk.download(corpus_name)
    return corpus
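# Hypothetical usage of nltk_corpus(): the helper works when the attribute name
# on nltk.corpus matches the download package name, as it does for 'brown' and
# 'stopwords':
brown = nltk_corpus('brown')
stop = nltk_corpus('stopwords')
print(brown.words()[:10])
print(stop.words('english')[:10])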
def generate(dictionary='/usr/share/dict/british-english', output='../../gb-us-synonyms.txt'):
    nltk.download('wordnet')

    with open(dictionary) as dict_file:
        with open(output, 'w') as output_file:
            for gb, us in gen_synonyms(dict_file):
                output_file.write(gb + ', ' + us + '\n')
                print(gb + ',', us)
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the corpus data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
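# Hypothetical usage of nltk_download_corpus(): each call returns True only if
# the resource actually had to be downloaded on this run:
nltk_download_corpus('corpora/wordnet')
nltk_download_corpus('sentiment/vader_lexicon')
nltk_download_corpus('tokenizers/punkt')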
def morphy(doc):
    """Lemmatize tokens using morphy, WordNet's lemmatizer."""
    # XXX Results will be better if we do POS tagging first, but then we
    # need to map Penn Treebank tags to WordNet tags.
    nltk.download('wordnet', quiet=False)
    return map(nltk.WordNetLemmatizer().lemmatize,
               _tokenize_if_needed(fetch(doc)))
def download_dataset(data_name):
    if data_name == 'flowers':
        print('== Flowers dataset ==')
        flowers_dir = os.path.join(DATA_DIR, 'flowers')
        flowers_jpg_tgz = os.path.join(flowers_dir, '102flowers.tgz')
        make_sure_path_exists(flowers_dir)

        # the original google drive link at https://drive.google.com/file/d/0B0ywwgffWnLLcms2WWJQRFNSWXM/view
        # from https://github.com/reedscot/icml2016 is problematic to download automatically, so included
        # the text_c10 directory from that archive as a bzipped file in the repo
        captions_tbz = os.path.join(DATA_DIR, 'flowers_text_c10.tar.bz2')
        print('Extracting ' + captions_tbz)
        captions_tar = tarfile.open(captions_tbz, 'r:bz2')
        captions_tar.extractall(flowers_dir)

        flowers_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
        print('Downloading ' + flowers_jpg_tgz + ' from ' + flowers_url)
        urlretrieve(flowers_url, flowers_jpg_tgz,
                    reporthook=dl_progress_hook)
        print('Extracting ' + flowers_jpg_tgz)
        flowers_jpg_tar = tarfile.open(flowers_jpg_tgz, 'r:gz')
        flowers_jpg_tar.extractall(flowers_dir)  # archive contains jpg/ folder

    elif data_name == 'skipthoughts':
        print('== Skipthoughts models ==')
        SKIPTHOUGHTS_DIR = os.path.join(DATA_DIR, 'skipthoughts')
        SKIPTHOUGHTS_BASE_URL = 'http://www.cs.toronto.edu/~rkiros/models/'
        make_sure_path_exists(SKIPTHOUGHTS_DIR)

        # following https://github.com/ryankiros/skip-thoughts#getting-started
        skipthoughts_files = [
            'dictionary.txt', 'utable.npy', 'btable.npy',
            'uni_skip.npz', 'uni_skip.npz.pkl',
            'bi_skip.npz', 'bi_skip.npz.pkl',
        ]
        for filename in skipthoughts_files:
            src_url = SKIPTHOUGHTS_BASE_URL + filename
            print('Downloading ' + src_url)
            urlretrieve(src_url, os.path.join(SKIPTHOUGHTS_DIR, filename),
                        reporthook=dl_progress_hook)

    elif data_name == 'nltk_punkt':
        import nltk
        print('== NLTK pre-trained Punkt tokenizer for English ==')
        nltk.download('punkt')

    elif data_name == 'pretrained_model':
        print('== Pretrained model ==')
        MODEL_DIR = os.path.join(DATA_DIR, 'Models')
        pretrained_model_filename = 'latest_model_flowers_temp.ckpt'
        src_url = 'https://bitbucket.org/paarth_neekhara/texttomimagemodel/raw/74a4bbaeee26fe31e148a54c4f495694680e2c31/' + pretrained_model_filename
        print('Downloading ' + src_url)
        urlretrieve(
            src_url,
            os.path.join(MODEL_DIR, pretrained_model_filename),
            reporthook=dl_progress_hook,
        )

    else:
        raise ValueError('Unknown dataset name: ' + data_name)
def _post_install():
    # Since nltk may have just been installed,
    # we need to update our PYTHONPATH.
    from importlib import reload
    import site
    reload(site)

    # Now we can import nltk.
    import nltk
    nltk.download('stopwords')
def test_notebook_runner_2a_eco_nlp_correction(self):
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")
    import nltk
    nltk.download('stopwords')
    self.common_notebook_runner_2a_eco_nlp_enonce("correction")
def download_packages(self):
    import nltk
    for x in [comp for comp in self._missing if "/" in comp]:
        package = x.split("/")[1]
        self.updateLabel.emit(package)
        nltk.download(package, raise_on_error=True)
        self.progressTheBar.emit()
def setup_dependencies(self):
    if not self._nltk_data_downloaded and bool(
            self.section.get('shortlog_imperative_check', True)):
        nltk.download([
            'punkt',
            'averaged_perceptron_tagger',
        ])
        type(self)._nltk_data_downloaded = True
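# The snippet above relies on nltk.download accepting a list of package names
# as well as a single name; the equivalent standalone call (a sketch,
# independent of the surrounding section machinery) is simply:
import nltk

nltk.download(['punkt', 'averaged_perceptron_tagger'])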
def main(): nltk.download("punkt") parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("training_datasets", type=str, nargs="+", help="path to training data sets") args = parser.parse_args(sys.argv[1:]) run(args)
def __init__(self):
    from nltk.data import find
    from nltk import download

    try:
        find('wordnet.zip')
    except LookupError:
        download('wordnet')
import numpy as np
import pandas as pd
import time
import nltk
from textblob import TextBlob
import dill as pickle
# import pickle
from customTransfomers import *

t0 = time.time()

#-------------------------------------
#***********Setting NLTK**************
#-------------------------------------
nltk.download('punkt')
nltk.download('stopwords')

#-------------------------------------
#***********Reading Data**************
#-------------------------------------
print("Reading Data ......")
df = pd.read_csv('fakecorpusWithMeta.csv')

print("Cleaning the Data ......")
# Drop unused columns
df = df.drop(columns=["scraped_at", "index", "Unnamed: 0"])

print("Filtering Data ......")
# If title is missing drop the entry
df = df.dropna(subset=['title'])