class tf_idf(object):
  
  def __init__(self, data):
    # Load data
    self.data = data
    
    # Parser
    self.parsers = Parsers()
    
    # Init tf-idf
    click.echo("Creating: index and tf-idf")
    self.index, self.tf, self.df, self.idf = self.load_index_tfidf()
    click.echo("Done.\n")
    
    # Ask for query (starts the interactive loop)
    self.get_query()

  def get_query(self):
    # Ask for query
    active = True
    while(active):
      click.echo("######################################################")
      click.echo("TYPE 'X' TO EXIT.")
      click.echo("Insert query:")
      query = input()
      click.echo("######################################################\n")
      
      if query == 'X' or query == 'x':
        click.echo("Exiting...")
        active = False
      else:
        self.search(query, self.index, self.idf, self.tf) 
        
    return 0
     
  def load_index_tfidf(self):
    """
      Loads the preprocesed returns:
      Returns:
        index --  inverted list "term": [["id",[pos1,pos1,..]].
        tf -- normalized term frequency per doc 
        df -- document frequency per term
        idf -- inversed docuemnt frequency
    """

    index = pickle.load(open("data/utils/index.p", "rb"))

    # Term freq of terms in tweets      
    tf = pickle.load(open("data/utils/tf.p", "rb"))

    # Tweet freq of term in corpus
    df = pickle.load(open("data/utils/df.p", "rb"))
    
    # Inverse df
    idf = pickle.load(open("data/utils/idf.p", "rb"))
  
    return index, tf, df, idf
        
  def rankDocuments(self, terms, docs, index, idf, tf):
    """
    Computes ranking given query and collection of tweets.

    Arguments:
      terms -- query - str.
      docs -- ID list of docs - list.
      index -- invertex index. - dict
      idf -- inverse document frequency - dict
      tf -- term frequency - dict
    Returns:
      resultDocs -- Ordered list of matching docs based on cosine-sim - list
    """

    # Dict with vector per docID
    docVectors = defaultdict(lambda: [0]*len(terms))

    # Vector per query
    queryVector = [0]*len(terms)

    # TF of query
    query_terms_count = collections.Counter(terms)
 
    # Norm query
    query_norm = np.linalg.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
      # Skip terms that do not appear in the collection
      if term not in index:
        continue

      # Score per term-query
      queryVector[termIndex] = query_terms_count[term]/query_norm * idf[term]

      for docIndex, (doc, postings) in enumerate(index[term]):
        # Check whether this doc is among the candidate docs containing query terms
        if doc in docs:
          # Score per term-doc (the tf entry is aligned with this term's postings list)
          docVectors[doc][termIndex] = tf[term][docIndex] * idf[term]

    #Cosine similarity query-doc
    docScores = [[np.dot(curDocVec, queryVector), doc] for doc, curDocVec in docVectors.items()]

    #Sort by descending similarity
    docScores.sort(reverse=True)

    #Get IDs
    resultDocs = [x[1] for x in docScores]

    return resultDocs
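
    # Worked example of the scoring above (illustrative numbers, not real index values):
    #   terms = ["bitcoin", "price"], each appearing once  ->  query_norm = sqrt(2)
    #   queryVector = [idf["bitcoin"]/sqrt(2), idf["price"]/sqrt(2)]
    #   docVectors[doc] = [tf["bitcoin"][i]*idf["bitcoin"], tf["price"][j]*idf["price"]]
    #   score(doc) = dot(docVectors[doc], queryVector); docs are returned sorted by this score.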
  
  def search_tf_idf(self, query, index, idf, tf, topn):
    """
  Preprocess query and find docs with words in query
  Arguments:
    query -- query - str.
    index -- inverted index - dict
    idf -- inverse document frequency - dict
    tf -- term frequency - dict
    topn -- N top ranked docs to be returned - int
  Returns
    ranked_docs -- list of topn docs ranked by cosine-sim - list
    """
    # Preprocess query
    query = self.parsers.getTerms(query)
 
    # Init set of docs with terms in query
    docs = set()

    for term in query:
      try:
        # Get IDs of docs containing the term
        termDocs = [posting[0] for posting in index[term]]

        # Add the new doc IDs
        docs = docs.union(termDocs)

      except KeyError:
        # Term not in the index; skip it
        pass
    
    docs = list(docs)

    # Rank docs with rankDocuments
    ranked_docs = self.rankDocuments(query, docs, index, idf, tf)
    ranked_docs = ranked_docs[:topn]

    return ranked_docs
  
  def search(self, query, index, idf, tf, topn=20):
    """
    Searches for tweets matching a query and displays the results.
    Arguments:
      query -- raw query - str
      index -- inverted index - dict
      idf -- inverse document frequency - dict
      tf -- term frequency - dict
      topn -- default: 20 - top N results to display - int
    """
  
    # Get topn docs
    ranked_docs = self.search_tf_idf(query, index, idf, tf, topn)

    if len(ranked_docs) == 0:
      click.echo("No results found !\n")
      return -1
  
    click.echo("Results\n")

    for i, doc_id in enumerate(ranked_docs):
      # Get the tweet corresponding to this doc ID
      doc = self.data[self.data['id'] == doc_id]
      tweet, date, author, retweets, favorites, url, hashtags = self.parsers.parser_tweet_results(doc)

      click.echo("______________________________________________________")
      click.echo(f"Tweet {i}")
      click.echo(f"\t·Author: {author}")
      click.echo(f"\t·Date: {date}")
      click.echo(f"\t·Tweet: {tweet}")
      click.echo(f"\t·Retweets: {retweets}")
      click.echo(f"\t·Favorites: {favorites}")
      click.echo(f"\t·Hashtags: {hashtags}")
      click.echo(f"\t·URL: {url}")
      click.echo("______________________________________________________\n")
class doc2vec(object):
    def __init__(self, data):
        # Get data
        self.data_Final = data

        #Processed data
        self.data = pd.read_csv("data/d2v_processed.csv")

        #Init NLP
        self.nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        self.nlp.max_length = 5000000

        #Load d2v model
        self.doc2vec_model = Doc2Vec.load("data/utils/d2v_model.kvmodel")

        #Load contractions
        self.contractions_dict = pickle.load(
            open("data/utils/contractions_dict.p", "rb"))

        # Init parsers
        self.parsers = Parsers()

        # Load tag id
        self.tag_id = pickle.load(open("data/utils/tag_id.p", "rb"))

        #Load id doc2vec
        self.id_doc2vec = pickle.load(open("data/utils/id_doc2vec.p", "rb"))

        #Get query - run program
        self.get_query(self.id_doc2vec)

    def get_query(self, id_doc2vector):
        # Ask for query
        active = True
        while (active):
            click.echo(
                "\n######################################################")
            click.echo("TYPE 'X' TO EXIT.")
            click.echo("Insert query:")
            query = input()
            click.echo(
                "######################################################\n")

            if query == 'X' or query == 'x':
                click.echo("Exiting...")
                active = False
            else:
                self.search(query, self.tag_id, self.id_doc2vec)

        return 0

    def search(self, query, tag_id, id_doc2vector, topn=20):
        """
    Search for tweets inputing a query and see displayed results.
    Arguments:
        id_doc2vector -- dic containing id:vec2doc pair - dic
        topn -- default: 20 - Top N result to display - int.

    """
        # Get ranked tweet IDs
        ranked_ids = self.rank(query, self.tag_id)
        ids = ranked_ids[:topn]

        click.echo("Results\n")

        for i, doc_id in enumerate(ids):
            doc = self.data_Final[self.data_Final["id"] == doc_id]
            tweet, date, author, retweets, favorites, url, hashtags = self.parsers.parser_tweet_results(
                doc)

            click.echo(
                "______________________________________________________")
            click.echo(f"Tweet {index}")
            click.echo(f"\t·Author: {author}")
            click.echo(f"\t·Date: {date}")
            click.echo(f"\t·Tweet: {tweet}")
            click.echo(f"\t·Retweets: {retweets}")
            click.echo(f"\t·Favorites: {favorites}")
            click.echo(f"\t·Hashtags: {hashtags}")
            click.echo(f"\t·ULR: {url}")
            click.echo(
                "______________________________________________________\n")

    def rank(self, query, tag_id):
        """
      Given a query preprocesses it, embeds it and return ordered dictionary of id:similarity_score
      pair.
      """
        # Pre-process query
        query = self.preprocessing(query)

        # Query vector
        q_vector = self.doc2vec_model.infer_vector(query.split())

        # Doc-query similarity: most_similar returns [(tag, similarity), ...] for the
        # 20 closest document vectors (this caps how many results search() can display)
        tag_sim = self.doc2vec_model.docvecs.most_similar([q_vector], topn=20)

        # Map doc2vec tags back to tweet IDs
        ids = [tag_id[id_[0]] for id_ in tag_sim]

        return ids

    def expand_contractions(self, text, contractions_dict, contractions_re):
        """
      Given contraction find match and substitude
      """
        def replace(match):
            return contractions_dict[match.group(0)]

        return contractions_re.sub(replace, text)

    def clean_text(self, text):
        """
      * Remove words with digits
      * Replace newline characters with space
      * Remove URLS
      * Replace non english chars with space
      """
        # Remove digits
        text = re.sub('\w*\d\w*', '', text)

        # Remove new Line chars
        text = re.sub('\n', ' ', text)

        #Remove links
        text = re.sub(r"http\S+", "", text)

        #Replace non-english chars
        text = re.sub('[^a-z]', ' ', text)

        return text
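
        # Illustrative example (assumed input; the text is lowercased beforehand in preprocessing()):
        #   "win big1 at https://t.co/xyz!\n"  ->  roughly "win  at  "
        #   (the repeated spaces are collapsed later in preprocessing())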

    def preprocessing(self, text):
        """
      Given a pandas dataframe apply preprocessing techinques
          * Lowercase the text
          * Expand Contractions
          * Clean the text
          * Remove Stopwords
          * Lemmatize words
      """
        # Lower case
        text = text.lower()

        # Regular expression for finding contractions
        contractions_re = re.compile('(%s)' %
                                     '|'.join(self.contractions_dict.keys()))

        #Expand contractions
        text = self.expand_contractions(text, self.contractions_dict,
                                        contractions_re)
        text = self.clean_text(text)

        #Remove added spaces
        text = re.sub(" +", " ", text)
        text = text.strip()

        #Stop words and Lemmatizing
        text = ' '.join([
            token.lemma_ for token in self.nlp(text)
            if not token.is_stop
        ])

        return text
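
# Usage sketch for the doc2vec engine (illustrative; assumes data/d2v_processed.csv, the
# trained d2v_model.kvmodel, and the pickled tag/id mappings under data/utils/ are present):
#
#   doc2vec(tweets_df)   # loads the model and starts the interactive query loop
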
class word2vec(object):
    def __init__(self, data):
        # Store data
        self.data_Final = data

        # Read prepocessed data
        self.data = pd.read_csv("data/w2v_processed.csv")

        # Init NLP
        self.nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        self.nlp.max_length = 5000000

        # Load w2v model
        self.w2v_model = KeyedVectors.load("data/utils/w2v_model.kvmodel")

        # Load contractions
        self.contractions_dict = pickle.load(
            open("data/utils/contractions_dict.p", "rb"))

        # Init parser
        self.parsers = Parsers()

        # Load id_doc2vector
        self.id_doc2vector = pickle.load(open("data/utils/id_doc2vec.p", "rb"))

        # Get query
        self.get_query(self.id_doc2vector)

    def get_query(self, id_doc2vector):
        # Ask for query
        active = True
        while (active):
            click.echo(
                "######################################################")
            click.echo("TYPE 'X' TO EXIT.")
            click.echo("Insert query:")
            query = input()
            click.echo(
                "######################################################\n")

            if query == 'X' or query == 'x':
                click.echo("Exiting...")
                active = False
            else:
                self.search(query, self.id_doc2vector)

        return 0

    def expand_contractions(self, text, contractions_re):
        """
      Given contraction find match and substitude
    """
        def replace(match):
            return self.contractions_dict[match.group(0)]

        return contractions_re.sub(replace, text)

    def clean_text(self, text):
        """
    * Remove words with digits
    * Replace newline characters with space
    * Remove URLS
    * Replace non english chars with space
    """
        # Remove digits
        text = re.sub('\w*\d\w*', '', text)

        # Remove new Line chars
        text = re.sub('\n', ' ', text)

        #Remove links
        text = re.sub(r"http\S+", "", text)

        #Replace non-english chars
        text = re.sub('[^a-z]', ' ', text)

        return text

    def preprocessing(self, text):
        """
    Given a pandas dataframe apply preprocessing techinques
        * Lowercase the text
        * Expand Contractions
        * Clean the text
        * Remove Stopwords
        * Lemmatize words
    """
        # Lower case
        text = text.lower()

        # Regular expression for finding contractions
        contractions_re = re.compile('(%s)' %
                                     '|'.join(self.contractions_dict.keys()))

        #Expand contractions
        text = self.expand_contractions(text, contractions_re)
        text = self.clean_text(text)

        #Remove added spaces
        text = re.sub(" +", " ", text)
        text = text.strip()

        #Stop words and Lemmatizing
        text = ' '.join([
            token.lemma_ for token in self.nlp(text)
            if not token.is_stop
        ])

        return text

    def embedding_w2v(self, doc_tokens):
        """
    Returns vector representation of a string
    """
        embeddings = []
        if len(doc_tokens) < 1:
            return np.zeros(100)
        else:
            for t in doc_tokens:
                if t in self.w2v_model.wv.vocab:
                    embeddings.append(self.w2v_model.wv.word_vec(t))
                else:
                    embeddings.append(np.random.rand(100))

        return np.mean(embeddings, axis=0)
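
        # Illustrative example (assumes a 100-dimensional model):
        #   embedding_w2v(["bitcoin", "price"]) returns the element-wise mean of the two
        #   word vectors; out-of-vocabulary tokens contribute random 100-d vectors instead.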

    def w2v_collection(self, data):
        """
    Given a collection of documents returns the pair id:vector where the vector is
    the embedding representation of the doc.
    """
        id_doc2v = {}
        for id, text in zip(data["id"].values, data["full_text"]):
            id_doc2v[id] = self.embedding_w2v(text)

        return id_doc2v

    def rank(self, query, id_doc2vec):
        """
    Given a query preprocesses it, embeds it and return ordered dictionary of id:similarity_score
    pair.
    """
        # Pre-process query
        query = self.preprocessing(query)

        # Query vector
        q_vector = self.embedding_w2v(query.split())

        #Doc query similarity
        doc_query_sim = {
            k: cosine_similarity(
                np.array(v).reshape(1, -1),
                np.array(q_vector).reshape(1, -1))
            for k, v in id_doc2vec.items()
        }

        # Sort
        doc_query_sim = {
            k: v
            for k, v in sorted(
                doc_query_sim.items(), key=lambda item: item[1], reverse=True)
        }

        return doc_query_sim
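
        # Similarity used above: cos(q, d) = (q · d) / (||q|| ||d||), computed on 1xN row
        # vectors (cosine_similarity is presumably sklearn.metrics.pairwise.cosine_similarity).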

    def search(self, query, id_doc2vector, topn=20):
        """
    Search for tweets inputing a query and see displayed results.
    Arguments:
    id_doc2vector: dic containing id:vec2doc pair - dic
    topn -- default: 20 - Top N result to display - int.

    """
        # Get ranked docs
        doc_query_sim = self.rank(query, id_doc2vector)
        ids = list(doc_query_sim.keys())[:topn]

        click.echo("Results\n")

        for i, doc_id in enumerate(ids):
            doc = self.data_Final[self.data_Final["id"] == doc_id]
            tweet, date, author, retweets, favorites, url, hashtags = self.parsers.parser_tweet_results(
                doc)

            click.echo(
                "______________________________________________________")
            click.echo(f"Tweet {i}")
            click.echo(f"\t·Author: {author}")
            click.echo(f"\t·Date: {date}")
            click.echo(f"\t·Tweet: {tweet}")
            click.echo(f"\t·Retweets: {retweets}")
            click.echo(f"\t·Favorites: {favorites}")
            click.echo(f"\t·Hashtags: {hashtags}")
            click.echo(f"\t·URL: {url}")
            click.echo(
                "______________________________________________________\n")