Example #1
class FlairEmbeddings(object):
    def __init__(self):
        self.stop_words = list(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=[flair_embedding_forward, flair_embedding_backward])

    def word_token(self, tokens, lemma=False):
        tokens = str(tokens)
        tokens = re.sub(
            r"([\w].)([\~\!\@\#\$\%\^\&\*\(\)\-\+\[\]\{\}\/\"\'\:\;])([\s\w].)",
            "\\1 \\2 \\3", tokens)
        tokens = re.sub(r"\s+", " ", tokens)
        if lemma:
            return " ".join([
                self.lemmatizer.lemmatize(token, 'v')
                for token in word_tokenize(tokens.lower())
                if token not in self.stop_words and token.isalpha()
            ])
        else:
            return " ".join([
                token for token in word_tokenize(tokens.lower())
                if token not in self.stop_words and token.isalpha()
            ])

    def cos_sim(self, a, b):
        return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

    def getFlairEmbedding(self, text):
        sentence = Sentence(text)
        self.stacked_embeddings.embed(sentence)
        return np.mean([np.array(token.embedding) for token in sentence],
                       axis=0)
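A brief usage sketch for the class above; the module-level flair_embedding_forward/flair_embedding_backward objects referenced in __init__ (plus the NLTK and numpy imports) are assumed to exist in the original module, and the example strings are purely illustrative:

# Hypothetical usage of Example #1 (assumes the globals referenced in __init__ exist)
embedder = FlairEmbeddings()
jd_vec = embedder.getFlairEmbedding(embedder.word_token("Senior Python developer with NLP experience", lemma=True))
cv_vec = embedder.getFlairEmbedding(embedder.word_token("Experienced Python engineer, NLP projects", lemma=True))
print(embedder.cos_sim(jd_vec, cv_vec))  # cosine similarity of the two mean token embeddings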
Example #2
class Embedder(object):

    def __init__(self, embedding=None, method=None, batch_size=5):

        assert method in [None, "average"], "Bad method"
        self.method = method
        self.batch_size = batch_size

        if embedding is not None:
            self.embedding = StackedEmbeddings(embedding)
        else:
            self.embedding = StackedEmbeddings([
                #WordEmbeddings('glove'),
                #WordEmbeddings('en-news'),
                #BytePairEmbeddings('en'),
                WordEmbeddings('crawl')
            ])

    def embed_data(self, sentences):
        sentences = [Sentence(s) for s in sentences]
        self.embedding.embed(sentences)

        if self.method == "average":
            sentences = [torch.stack([word.embedding.detach().cpu() for word in s]).mean(
                0) for s in sentences]
        else:
            sentences = [torch.stack(
                [word.embedding.detach().cpu() for word in s]) for s in sentences]

        return sentences

    def embed_dataset(self, sentences):
        sentences = self.embed_data(sentences)
        return sentences
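A minimal usage sketch for Embedder, assuming flair can download the 'crawl' fastText vectors; with method="average" each sentence collapses to a single fixed-size vector:

# Hypothetical usage of Example #2
embedder = Embedder(method="average")
vectors = embedder.embed_dataset(["The first sentence .", "Another short sentence ."])
print(len(vectors), vectors[0].shape)  # 2 sentences, each reduced to one averaged embedding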
Example #3
def test_stacked_embeddings():
    (sentence, glove, charlm) = init_document_embeddings()
    embeddings = StackedEmbeddings([glove, charlm])
    embeddings.embed(sentence)
    for token in sentence.tokens:
        assert (len(token.get_embedding()) == 1124)
        token.clear_embeddings()
        assert (len(token.get_embedding()) == 0)
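The init_document_embeddings helper is not shown; a plausible sketch (an assumption, with model choices picked so that GloVe's 100 dimensions plus a 1024-dimensional Flair LM match the 1124 asserted above) might be:

# Assumed helper for Example #3; the model choices are a guess consistent with the 1124-dim assertion
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings

def init_document_embeddings():
    sentence = Sentence('I love Berlin .')
    glove = WordEmbeddings('glove')                 # 100 dimensions
    charlm = FlairEmbeddings('news-forward-fast')   # 1024 dimensions
    return sentence, glove, charlm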
Example #4
class FlairPretrained(ModelBase):
    """
    Encapsulates pretrained Flair Embeddings (Zalando Flair) by conforming to the ModelBase interface.
    """
    def __init__(self, model=None):
        super(FlairPretrained, self).__init__()

        if model is not None:
            self.model = model
        else:
            self.model = StackedEmbeddings([
                FlairEmbeddings('news-forward-fast'),
                FlairEmbeddings('news-backward-fast'),
            ])

    def dim(self) -> int:
        """
        The dimensionality of created embeddings.

        :return: 2048 (for now, #TODO)
        """
        return 2048

    def get_word_vector(self, word: str) -> Optional[np.ndarray]:
        """
        Returns the word vector for |word|, or None. Using this method is discouraged, as it defeats the
        purpose of contextual Flair embeddings; include the surrounding context for more accurate vectorization.

        In reality, Flair embeddings never return None, even for bogus words.

        :param word: The word to vectorize.
        :return: Either the word vector or None.
        """
        dummy_sentence = Sentence(word)
        self.model.embed(dummy_sentence)
        return np.array(list(dummy_sentence)[0].embedding)

    def get_word_vectors(self, words: List[str]) -> List[np.ndarray]:
        """
        Vectorizes the list of words, using pretrained Flair embeddings. These embeddings are context dependent, so this
        method is preferred over fetching word vectors for single words.

        :param words: The list of words to vectorize.
        :return: A list of word vectors.
        """
        sentence = Sentence(' '.join(words))
        self.model.embed(sentence)
        return list(
            map(lambda token: np.array(token.embedding), list(sentence)))

    def vectorize_context(self, words: List[str]) -> Optional[np.ndarray]:
        """
        Transforms the context into a single vector. May return None in extreme cases, e.g. if |words| is an empty list.

        :param words: List of tokens describing the context.
        :return: A single word vector or None.
        """
        return self.mean_of_words(self.get_word_vectors(words))
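A short usage sketch for FlairPretrained; ModelBase and its mean_of_words helper come from the surrounding project and are assumed to be importable:

# Hypothetical usage of Example #4
model = FlairPretrained()
vectors = model.get_word_vectors(["flair", "embeddings", "are", "contextual"])
print(len(vectors), vectors[0].shape)                 # 4 vectors, each of length model.dim() == 2048
context_vec = model.vectorize_context(["flair", "embeddings"])  # mean over the word vectors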
Example #5
class DefaultFeaturizerForSeqTagging(ObservationFeaturizer):
    def __init__(self,
                 action_space: ActionSpace,
                 embedding_type: str = "fasttext",
                 device: str = "cpu"):
        self.device = device
        self._setup_device()
        embeddings = EmbeddingRegistry.get_embedding(embedding_type)
        self.doc_embeddings = StackedEmbeddings(embeddings).to(
            torch.device(self.device))
        self.action_space = action_space
        self._current_token_embeddings: List[torch.tensor] = None

    def _setup_device(self):
        import flair, torch
        flair.device = torch.device(self.device)

    def init_on_reset(self, input_text: Union[List[str], str]):
        sent = Sentence(input_text)
        self.doc_embeddings.embed(sent)
        self._current_token_embeddings = [
            token.embedding.cpu().detach() for token in sent
        ]
        sent.clear_embeddings()

    def featurize(self, observation: Observation) -> torch.Tensor:
        input_vector = self._featurize_input(observation.get_current_index())
        context_vector = self._featurize_context(
            observation.get_current_action_history())
        concatenated = torch.cat((input_vector, context_vector), dim=0)
        return concatenated

    def get_observation_dim(self) -> int:
        return self._get_input_dim() + self._get_context_dim()

    def _featurize_input(self, input_index: int) -> torch.Tensor:
        input_features = self._current_token_embeddings[input_index]
        return input_features

    def _featurize_context(self, context: List[str]) -> torch.Tensor:
        # consider only last action
        context_vector = torch.zeros(self.action_space.size())
        context_ = [context[-1]] if len(context) > 0 else []
        action_indices = [
            self.action_space.action_to_ix(action) for action in context_
        ]
        context_vector[action_indices] = 1.0
        return context_vector

    def _get_input_dim(self):
        sent = Sentence("A random text to get the embedding dimension")
        self.doc_embeddings.embed(sent)
        dim = sent[0].embedding.shape[0]
        sent.clear_embeddings()
        return dim

    def _get_context_dim(self):
        return self.action_space.size()
Example #6
def flair_vect(file):
  
  features = pd.read_csv(file, sep=";", skiprows = 0)
  text = features['cible_TT'] 
  txt = text.tolist()
  flair_forward  = FlairEmbeddings('news-forward-fast')
  flair_backward = FlairEmbeddings('news-backward-fast')
  stacked_embeddings = StackedEmbeddings( embeddings = [ 
                                                       flair_forward, 
                                                       flair_backward
                                                      ])
  # create a sentence #
  sentence = Sentence('Analytics Vidhya blogs are Awesome .')
  # embed words in sentence #
  stacked_embeddings.embed(sentence)
  for token in sentence:
    print(token.embedding)
  # data type and size of embedding #
  print(type(token.embedding))
  # storing size (length) #
  z = token.embedding.size()[0]
  
  # creating a tensor for storing sentence embeddings #
  s = torch.zeros(0,z)

  # iterating Sentence (tqdm tracks progress) #
  for t in tqdm(txt):   
    # empty tensor for words #
    w = torch.zeros(0,z)   
    sentence = Sentence(t)
    stacked_embeddings.embed(sentence)
    # for every word #
    for token in sentence:
      # storing word Embeddings of each word in a sentence #
      w = torch.cat((w,token.embedding.view(-1,z)),0)
    # storing sentence Embeddings (mean of embeddings of all words)   #
    s = torch.cat((s, w.mean(dim = 0).view(-1, z)),0)
    
  ## tensor to numpy array ##
  vect = s.numpy()   
  print(vect)

  print(features.shape)
  lb = LabelBinarizer()

  XX = lb.fit_transform(features.implicite.values)
  imp_encoded = pd.DataFrame(XX, columns = ["avis","doute","volonté"])
  features = pd.concat([features, imp_encoded], axis=1)
  labels = np.array(features.iloc[:,36:])
  print(labels)
  
  # Split the data into training and testing sets
  train_features, test_features, train_labels, test_labels = train_test_split(vect, labels, test_size = 0.25, random_state = 100)
  
  labels_pca = features["implicite"]
  
  return train_features, test_features, train_labels, test_labels, labels_pca, vect
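The manual mean pooling above can also be expressed with flair's DocumentPoolEmbeddings (mean pooling is its default); a minimal sketch using the same model variants:

# Equivalent mean-pooled sentence vector via DocumentPoolEmbeddings (sketch)
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings

doc_embeddings = DocumentPoolEmbeddings([FlairEmbeddings('news-forward-fast'),
                                         FlairEmbeddings('news-backward-fast')])
s = Sentence('Analytics Vidhya blogs are Awesome .')
doc_embeddings.embed(s)
print(s.embedding.shape)  # one vector per sentence, the mean of its token embeddings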
Example #7
class LayerFlairEmbeddings(LayerBase):
    """LayerFlairEmbeddings implements contextual string (Flair) embeddings."""
    def __init__(self,gpu):
        super(LayerFlairEmbeddings, self).__init__(gpu)
        self.gpu = gpu
        # self.flair_embeddings_dim = flair_embeddings_dim
        # self.freeze_flair_embeddings = freeze_flair_embeddings

        self.output_dim = 4096

        self.flair_embedding_forward = FlairEmbeddings('/home/jlfu/flair_model/news-forward-0.4.1.pt')
        self.flair_embedding_backward = FlairEmbeddings('/home/jlfu/flair_model/news-backward-0.4.1.pt')
        self.stacked_embeddings = StackedEmbeddings([
            self.flair_embedding_forward,
            self.flair_embedding_backward
        ])
        # self.glove_embedding = WordEmbeddings('glove')
        # self.args= args
        # if self.args.use_flair_glove:
        #     self.stacked_embeddings = StackedEmbeddings([
        #         self.glove_embedding,
        #         self.flair_embedding_forward,
        #         self.flair_embedding_backward
        #     ])
        #     self.output_dim = 4096



    def is_cuda(self):
        return self.embeddings.weight.is_cuda

    def forward(self, word_sequences):
        batch_size = len(word_sequences)
        max_seq_len = max([len(word_seq) for word_seq in word_sequences])
        flair_embedding = torch.zeros(batch_size, max_seq_len, self.output_dim)

        # create a sentence
        for i,word_sequence in enumerate(word_sequences):
            word_seq_str = ' '.join(word_sequence)
            sentence = Sentence(word_seq_str)
            # self.flair_embedding_forward.embed(sentence)
            self.stacked_embeddings.embed(sentence)
            for j,token in enumerate(sentence):
                # print('token.embedding',token.embedding)
                flair_embedding[i][j][:] = token.embedding
            # print('flair_embedding',flair_embedding)
            # break
        return flair_embedding
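A rough call sketch for the layer above; LayerBase and the local model paths in __init__ belong to the original project, so this is only an assumption about how the layer is driven:

# Hypothetical forward pass for Example #7 (requires the .pt model files referenced in __init__)
layer = LayerFlairEmbeddings(gpu=-1)
word_sequences = [['The', 'cat', 'sat'], ['Hello', 'world']]
out = layer.forward(word_sequences)
print(out.shape)  # (batch_size=2, max_seq_len=3, output_dim=4096)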
Example #8
class FlairEmbeddings(object):

	def __init__(self):
		self.stop_words = list(stopwords.words('english'))
		# self.stop_words = []
		self.lemmatizer = WordNetLemmatizer()
		self.stacked_embeddings = StackedEmbeddings(
			embeddings=[flair_forward_embedding, flair_backward_embedding])


	def word_token(self, tokens, lemma=False):
		tokens = str(tokens)
		tokens = re.sub(r"([\w].)([\~\!\@\#\$\%\^\&\*\(\)\-\+\[\]\{\}\/\"\'\:\;])([\s\w].)", "\\1 \\2 \\3", tokens)
		tokens = re.sub(r"\s+", " ", tokens)
		if lemma:
			return " ".join([self.lemmatizer.lemmatize(token, 'v') for token in word_tokenize(tokens.lower()) if token not in self.stop_words and token.isalpha()])
		else:
			return " ".join([token for token in word_tokenize(tokens.lower()) if token not in self.stop_words and token.isalpha()])


	def cos_sim(self, a, b):
		return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

	def getFlairEmbedding(self, text):
		sentence = Sentence(text)
		self.stacked_embeddings.embed(sentence)
		return np.mean([np.array(token.embedding) for token in sentence], axis=0)

	def main(self):
		jd = "If you would be interested please send an updated word document of your resume to vprinci synergishr.com and let me know when you are free to chat. Lead Engineer large start up in San Francisco. Permanent role. strong Python preferred Java C C or C programming and 2 years in a lead or senior engineer role. Interest in online music subscription based company. Great compensation benefits PTO equity etc. Web App Dev Google in Sunnyvale. Long term contract. 5 years of JavaScript HTML CSS AJAX jQuery Closure and or Backbone preferred . Reports to Head of Learning Development technology Frontend Dev Google in Mountain View. 6 month contract. 2 openings. HTML CSS JavaScript. Closure Angular or D3 . They are building an internal market intelligence tool that leverages visualization. The backend has been built and the frontend will be built using JavaScript and D3 libraries."
		
		resume = "OZAN MANAV Istanbul Turkey www.ozanmanav.com +90 551 860 2015 Tranings Volunteer works - Microsoft Student Partner - Google Scholarship - Udemy Pluralsight Javascript Learning Path EDUCATION BACHELOR DEGREE ESKISEHIR OSMANGAZI UNIVERSITY BS in Computer engineering Sep 2012 June 2017 Responsibilities Works - I developed React Native and Native Mobile Applications. - QRcode Supported Membership System Android App Google Play- Elma Cafe Plus - IOT device management app based on logical values Google Play- Rim Control 4 AYTIM GROUP - TURKEY Aytim is the gaming textile company in Turkey providing services in all business lines. JUNIOR SOFTWARE ENGINEER Feb 2014 - Dec 2016 Responsibilities Works - I provided methodologies for object-oriented software development and efficient database design. - We have developed payment systems infrastructure together with the team. - I ve experienced UI testing frameworks like Selenium - I gave trainings to my team about Clean Code and TDD. From Robert C.Martin Books - I ve developed myself for secure software development. BDDK PCI Standards ARENA COMPUTER INC. or Payment Systems - TURKEY Arena is the leading provider of technology products and related supply chain management services in Turkey. Arena is characterised by its high level of innovation professional management and development strategies. Dec 2016 Jan 2018 SOFTWARE ENGINEER Im a Software Engineer familiar with a wide range of programming utilities and languages. Knowledgeable of backend and frontend development requirements. Able to handle any part of the process with ease. Collaborative team player with excellent technical abilities offering 4 years of related experience. JOB EXPERIENCESSOFTWARE ENGINEER Jan 2018 - Current ATP Ata Technology Platform - TURKEY ATP a leader in finance technologies addresses the needs of brokerage firms portfolio managers and insurance companies with comprehensive solutions and services. Its platforms handle a significant portion of the Istanbul Stock Exchanges trading volume. Responsibilities Works - Im supporting frontend mobile and web development and improvement process - Weve developed a dashboard with ReactJS and continuing maintenance with my team friends. - I ve developed react native screens in some parts of Native mobile projects. - In addition I developed mobile applications with React Native for Shiftdelete.net one of Turkey s largest tech news sites. [email protected] GraphQL AWS Docker TDD or BDD Agile or Scrum RESTFul APIs Node Webpack Git HTML5 CSS3 ES6 SOFTWARE ENGINEER Javascript React or React Native Redux CORE SKILLS PROFESSIONAL SUMMARY"
		resume = " Microsoft Word - Raviteja Kondubhatla.docx Ravteja Kondubhatla Data Scientist [email protected] Summary With my 5 years of experience in coding with analytical programming using Python SQL and Hadoop Id like to plan design and implement database solutions and work cross-functionally to customize client needs. My passion is to develop web application back end components and offer support to the front-end developers. Experience Data Scientist Cuna Mutual Group Wisconsin USA Oct 2018 - Present Implemented discretization and binning data wrangling cleaning transforming merging and reshaping data frames using python libraries like Numpy Scikit Matplotlib and Pandas Developed a propensity score generator for targeting the prospective Credit Union members using Machine Learning algorithms using Python Data Analyst Python Development Samsung California USA May 2018-Jul 2018 Automated batch test evaluation that allows a smooth flow of data from distributed data systems to the local machines and involved in Unit testing and Integration testing of the code Created a text normalizer using NLP for Bixby modules and created a workflow using technologies such as GIT Gained experience in working with various Python Integrated Development Environments like IDLE PyCharm Atom Eclipse and Sublime Text Senior Data Analyst Beroe Inc Chennai India May 2012 Dec 2016 Increased revenue by 40 by targeting the most profitable set of customers for a campaign about sustainability by performing a logistic regression technique Designed a product that provides actionable recommendations by identifying best cost sourcing suppliers LCCS for P G by making 95 accurate price forecasts in 2014-15 using elasticity modelling Projects Quantitative Analytics- Credit scoring model for loan applicants - Built a model to identify customers who were likely to default on a loan after extensive data cleaning-missing value and transforming the data outlier treatment . The model used was logistic regression with variables like total transactions purchase volume etc. Predictive Analytics Hospital Ranking - Analyzed hospital data and determined rank of all the hospitals in the United States of America based on the number of patients treated doctor availability and successful operations using python Skills Python SQL Hadoop Education University of Texas at Dallas M.S.in Data Analytics GPA 3.5 Jan 2017- Jul 2018 BITS Pilani B.E. in Engineering GPA 3.5 Aug 2007- May 2012 "
		# resume = " Microsoft Word - Shalini Channappa.docx Shalini Channappa Front End Web Developer [email protected] Hard-working web developer with a flair for creating elegant solutions in the least amount of time. Passionate about building responsive websites mobile apps and interactive features that drive business growth and improve UX. Experience Front End Web Developer Cisco 07 or 2018 - present Develop and test new components for the Digital partner advisor DPA project using Cisco UI Angular. Experience in developing single page applications using Angular. Improvise existing components and usability of various areas of the application working closely with a Product manager. Work in an Agile Scrum methodology on fast-moving projects. Extensive experience in UI web applications using HTML5 CSS3 Javascript XML jQuery AJAX JSON Angular and integrating Restful API s. Worked on eliminating bootstrap one of the two UI libraries of the application in order to avoid bloat overwriting and conflicts. Also handled the aftermath of the breakdown of layout and components and stabilized the application with release readiness in one sprint. Upgraded DPA to the current version of Cisco UI which was six versions behind and 90 of the library being overridden by custom definitions. Freelance Web Developer 08 or 2016 - 05 or 2018 Clients Turbo Tax Gabes Rentelo GPA Saver Translated design teams UX wireframes and mockups into responsive interactive features using HTML CSS and JavaScript. Worked with agile team to migrate legacy company website to a Wordpress site. Redesign of Gabes Android mobile app which increased downloads by 18 in less than 6 months. Increased email signups 12 by creating new UI for website landing page in React. Created highly detailed and annotated architectural wireframes. Successfully submitted MVPS. Actively participated in slack channels daily standups UI or UX design process code reviews responsive design managing project using Github s project Kanban board interface documentation testing and the final product launch. Manager Risk Investigations Amazon.com 09 or 2012 - 08 or 2016 Created grease monkey scripts to improve manual investigation efficiency by 115 . Created a script to review investigation steps dynamically and enable mistake proofing to improve investigation quality and reduce decision defect. Conducted a six sigma yellow belt Kaizen event with business operations analytics and software development team to determine and build machine learning model and variable to reduce incoming volume by 45 and saved 7.5 MM. Created dashboard for Amazon.in category management team using ETL jobs. Web Developer Intern Hindustan Aeronautics Limited 01 or 2011 - 05 or 2011 Designed UX wireframes and mockups and translated into interactive features using HTML CSS and JavaScript. Involved in writing stored procedures queries triggers and views. Wrote SQL queries to interact with SQL Server database. Web Developer Intern E Surveying Softtech 08 or 2010 - 12 or 2010 Handled search engine optimization SEO for the company s website resulting in which the website managed to top the Google search in survey related software. Performed Manual Testing on newly launched software technical content writing for upcoming software releases and web content development. 
		# EDUCATION Texas A M University 08 or 2016 - 05 or 2018 Master Of Science Computer Science GPA 3.91 SKILLS HTML CSS SQL JavaScript UI or UX Design Angular React Native AWARDS - Above and beyond awards in Q1 and Q3 of 2015 from Amazon.com - Received 6 employee of the month awards from Amazon.com - Awarded as the best Quality auditor during my tenure as quality auditor - Best new hire trainee from a batch of twelve in Amazon.com - Recipient of Grow with Google Developer Challenge Scholarship "
		# resume = "Roy Lee 628.209.9740 [email protected] github.com or rvlee linkedin.com or in or rvlee628 angel.co or rvlee SOFTWARE PROJECTS Optimizon System Design - http or or bit.ly or Optimizon Technologies used Express PostgreSQL New Relic Redis Artillery.io Loader.io NGINX Designed and optimized a microservice system to handle real-life traffic patterns with a production level database Improved traffic threshold from 800 RPS to 1300 RPS and latency from 253ms to 78ms by implementing Redis Caching and NGINX Load Balancing Analyzed architecture using Loader.io Artillery.io in conjunction with New Relic to identify performance bottlenecks Bear Bull Tech - Full Stack project managing stocks in real-time http or or bit.ly or bbt-checkout Technologies used Express MongoDB or Mongoose React.js Docker Integrated React.js Node.js and Express.js to build a financial forecast web application with dynamic features and extensible components Containerized a microservice and proxy server with Docker expediting interactions within AWS EC2 environment IoweU - Cost splitter that notifies multiple users about the total difference between a payment Technologies used Express React.js PostgreSQL Passport.js Built a web application to automatically split costs evenly among groups and notify members the total owed balance Implemented a user authentication system using Passport.js to ensure user privacy and seamless login or registration Converted the web app into an iOS or Android app using React Native to allow seamless migration onto mobile devices EXPERIENCE Beech International Inc. - Sales Rep or Product Manager - Taipei Taiwan or Shanghai China 2017 - 2018 Increased revenue by 10 and reduced cost by over 15 by facilitating overseas meetings with customers about quarterly forecasts and future sales projections Worked and collaborated closely with dealers customers and internal or external parties to identify new ideas and trends in the area of promotional products increasing sales by 7 Detekt Technology Inc. Additive Manufacturing Expert - Medical or R D - Taipei Taiwan 2014 - 2015 Reduced surgery time and risk by 50 and 80 through creating a surgical guide using 3D software to aid in the process of surgery Designed and developed a system to print 3D printed prototypes increasing the success rate during production Zalora - Operations Analyst or Head of Operations Team - Taipei Taiwan or Jakarta Indonesia 2012 - 2013 Analyzed data and reports generated from the company database with MySQL and evaluated fashion trends and sales data for buying and marketing department Analyzed fashion trends and sales for our marketing team increasing CTR by 13 and reducing bounce rate by 5 Developed Excel Macros to optimize operations across departments which led to a 20 and 30 increase in efficiency in the Customer Service and Warehouse department EDUCATION Pennsylvania State University 2007-2011 - Mechanical Engineering Economics B.S. Hack Reactor San Francisco 2018 - Advanced Software Engineering Immersive Program SKILLS Languages - Javascript Node.js ES6 HTML CSS Git NPM SQL Front End - React.js Redux React Router AngularJS jQuery Webpack Babel Jest or Enzyme Chai or Mocha Back End or Misc - Express MySQL Cassandra PostgreSQL MongoDB Mongoose Docker NGINX Redis AWS EC2 or S3 Heroku New Relic Artillery.io Loader.io CircleCI Other - AutoCAD English Mandarin Japanese"

		jd = self.word_token(jd, True)
		resume = self.word_token(resume, True)
		
		print("\n JD --- ", jd, len(jd))
		print("\n resume --- ",resume, len(resume))
		
		jd_embeddings = self.getFlairEmbedding(jd)
		resume_embeddings = self.getFlairEmbedding(resume)
		print(jd_embeddings.shape, resume_embeddings.shape)
		cosine_similairty = self.cos_sim(jd_embeddings, resume_embeddings)
		print("\n cosine_similairty --- ",cosine_similairty)
Example #9
class EmbedSentence:
    """
    EmbedSentence class helps in embeddings a sentence

    """

    def __init__(self):
        """
        initialize the word embedding and document embedding classes
        """
        self.word_embedding = flair.embeddings.WordEmbeddings('glove')
        self.doc_embedding = flair.embeddings.DocumentPoolEmbeddings([self.word_embedding])

        # embedding
        self.flair_forward = FlairEmbeddings('news-forward-fast')
        self.backward_flair = FlairEmbeddings('news-backward-fast')

        # stacked embedding
        self.stacked_embedding = StackedEmbeddings(embeddings=[
            self.flair_forward,
            self.backward_flair])

    def embed_str(self, sentence: str) -> torch.Tensor:
        """
        This function converts a sentence to a Tensor of embeddings
        :param sentence: str, for example: 'hello world'
        :return: a tensor whose shape is determined by the flair document embedding
        """
        __sentence = Sentence(sentence)
        self.doc_embedding.embed(__sentence)
        return __sentence.embedding

    def stacked_embed(self, sentence: str, return_sentence: bool = False) -> Union[torch.Tensor, Sentence]:
        """

        :param sentence:
        :param return_sentence:
        :return:
        """
        __sentence = Sentence(sentence)
        self.stacked_embedding.embed(__sentence)

        if return_sentence:
            return __sentence
        else:
            return __sentence.embedding
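A usage sketch for EmbedSentence; note that the stacked Flair embeddings are token-level, so the per-token vectors live on the tokens of the returned Sentence:

# Hypothetical usage of Example #9
embedder = EmbedSentence()
doc_vec = embedder.embed_str('hello world')                 # pooled GloVe document embedding
sent = embedder.stacked_embed('hello world', return_sentence=True)
for token in sent:
    print(token.text, token.embedding.shape)                # stacked forward+backward Flair vectors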
Example #10
def vectorise(self, text):
    # stack forward and backward Flair embeddings
    flair_forward = FlairEmbeddings('news-forward')
    flair_backward = FlairEmbeddings('news-backward')
    stacked_embeddings = StackedEmbeddings(embeddings=[flair_forward,
                                                       flair_backward])
    sentence = Sentence(text)
    stacked_embeddings.embed(sentence)
    # per-token embedding dimensionality
    z = stacked_embeddings.embedding_length
    # collect every token embedding, then mean-pool into a single sentence vector
    w = torch.zeros(0, z)
    for token in sentence:
        w = torch.cat((w, token.embedding.view(-1, z)), 0)
    s = w.mean(dim=0).view(-1, z)
    return s
Example #11
class EasyStackedEmbeddings:
    """Word Embeddings that have been concatenated and "stacked" as specified by flair

    Usage:

    ```python
    >>> embeddings = adaptnlp.EasyStackedEmbeddings("bert-base-cased", "gpt2", "xlnet-base-cased")
    ```

    **Parameters:**

    * `*embeddings` - Non-keyword variable number of strings specifying the embeddings you want to stack
    """
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            self.embedding_stack.append(
                _get_embedding_model(model_name_or_path))

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)

    def embed_text(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """Stacked embeddings

        **Parameters**:
        * `text` - Text input, it can be a string or any of Flair's `Sentence` input formats

        **Return**:
        * A list of Flair's `Sentence`s
        """
        # Convert into sentences
        sentences = _make_sentences(text, as_list=True)

        # Unlike flair embeddings modules, stacked embeddings do not return a list of sentences
        self.stacked_embeddings.embed(sentences)
        return sentences
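A usage sketch matching the docstring above; adaptnlp's helpers referenced in the class (_get_embedding_model, _make_sentences) are assumed to be importable from the surrounding module:

# Hypothetical usage of Example #11
embeddings = EasyStackedEmbeddings("bert-base-cased", "xlnet-base-cased")
sentences = embeddings.embed_text("This is a test sentence.")
for token in sentences[0]:
    print(token.text, token.embedding.shape)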
Example #12
def get_embeddings(encoder, sentence, input_lang):
    with torch.no_grad():
        if word_vecs == "flair":
            flair_embedding = StackedEmbeddings([
                FlairEmbeddings('de-forward'),
                FlairEmbeddings('de-backward'),
            ])

            sent = Sentence(sentence + " <EOS>")
            flair_embedding.embed(sent)
            input_tensor = [token.embedding for token in sent.tokens]
            input_length = len(input_tensor)
        else:
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        return encoder_hidden
Example #13
class FlairEmbeddings(emb.base.Embeddings):

    def __init__(self, forward, backward, use_tokenizer, *args, **kwargs):

        super(FlairEmbeddings, self).__init__(*args, **kwargs)

        self._forward = forward
        self._backward = backward
        self._use_tokenizer = use_tokenizer

        from flair.embeddings import FlairEmbeddings as FLEmbeddings
        from flair.embeddings import StackedEmbeddings

        self._embeddings = StackedEmbeddings([FLEmbeddings(forward), FLEmbeddings(backward)])

    def get(self, keys, return_positions):

        from flair.data import Sentence

        sentences = [Sentence(key, use_tokenizer=self._use_tokenizer) for key in keys]

        # noinspection PyUnresolvedReferences
        self._embeddings.embed(sentences)

        for s_idx, (sentence, ret_positions) in enumerate(zip(sentences, return_positions)):

            for t_idx, token in enumerate(sentence):

                if t_idx not in ret_positions:
                    continue  # ignore tokens where embeddings have not been requested

                yield s_idx, token.text, token.embedding.cpu().numpy()

    def config(self):

        return {'description': self.description()}

    def description(self):

        return "flair-{}-{}".format(self._forward, self._backward)
Example #14
def get_flair_vectors(vocab):
    print("Looking for flair vectors")
    #import flair embeddings!
    #we can change here to use different embeddings
    glove_embedding = WordEmbeddings('glove')
    twitter_embedding = WordEmbeddings('en-twitter')
    character_embeddings = CharacterEmbeddings()
    stacked_embeddings = StackedEmbeddings(embeddings=[glove_embedding, character_embeddings,twitter_embedding])

    flair_vectors = {}
    found = 0
    for word in vocab:
        wt=Sentence(word)
        stacked_embeddings.embed(wt)
        vector=wt[0].embedding.detach().numpy()
        #if the word is not in the embedding dict, the vector will be all zero
        if np.sum(np.abs(vector))>0:
            flair_vectors[word]=vector
            found += 1
    print('\n')
    print('Found %d words in the stacked embeddings' % found)
    return flair_vectors
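A quick usage sketch for get_flair_vectors; downloading the GloVe, Twitter, and character embeddings is assumed to succeed:

# Hypothetical usage of Example #14
vocab = ["cat", "dog", "running"]
flair_vectors = get_flair_vectors(vocab)
print(len(flair_vectors), next(iter(flair_vectors.values())).shape)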
Example #15
class FlairEmbedder(nn.Module):
    def __init__(self, flair_model="news", word="glove", finetune=False):
        super(FlairEmbedder, self).__init__()

        self.flair_model = flair_model
        self.word = word
        self.finetune = finetune

        embeddings = [
            FlairEmbeddings('{}-forward'.format(self.flair_model)),
            FlairEmbeddings('{}-backward'.format(self.flair_model))
        ]

        if self.word is not None:
            embeddings.append(WordEmbeddings(self.word))

        self.stacked_embeddings = StackedEmbeddings(embeddings)
        self.w_embed_dim = self.stacked_embeddings.embedding_length
        self.name = "flair"

    def forward(self, data):
        device = data["words"].device
        sentences = [Sentence(detokenize(s)) for s in data["sents"]]

        if self.finetune:
            self.stacked_embeddings.embed(sentences)

        else:
            self.stacked_embeddings.eval()
            with torch.no_grad():
                self.stacked_embeddings.embed(sentences)

        tensors = [
            torch.cat([token.embedding.unsqueeze(0) for token in sent], dim=0)
            for sent in sentences
        ]

        return {"embeddings": pad(tensors).to(device)}
Example #16
class FlairEmbeddings(emb.base.Embeddings):

    def __init__(self, forward, backward, use_tokenizer, *args, **kwargs):

        super(FlairEmbeddings, self).__init__(*args, **kwargs)

        self._forward = forward
        self._backward = backward
        self._use_tokenizer = use_tokenizer

        from flair.embeddings import FlairEmbeddings as FLEmbeddings
        from flair.embeddings import StackedEmbeddings

        self._embeddings = StackedEmbeddings([FLEmbeddings(forward), FLEmbeddings(backward)])

    def get(self, keys):

        from flair.data import Sentence

        sentences = [Sentence(key, use_tokenizer=self._use_tokenizer) for key in keys]

        # noinspection PyUnresolvedReferences
        self._embeddings.embed(sentences)

        for s_idx, sentence in enumerate(sentences):

            for t_idx, token in enumerate(sentence):

                yield token.text, token.embedding.cpu().numpy()

    def config(self):

        return {'description': self.description()}

    def description(self):

        return "flair-{}-{}".format(self._forward, self._backward)
Example #17
class Environment:
    def __init__(self, args, agent_mode):
        # initializes environment variables and then reads sentences.

        print('Initializing the Environment...')
        self.domain = args.domain
        self.dis_dim = args.dis_dim  # 50
        self.tag_dim = args.tag_dim  # 50
        self.word_dim = args.word_dim  # 50
        self.num_words = args.num_words  # 500
        self.action_rate = args.action_rate  # 0.1
        self.use_act_rate = args.use_act_rate  # 1
        # self.use_act_att = args.use_act_att  # 0
        self.reward_base = args.reward_base  # 50.0
        self.ra = args.reward_assign  # [1,2,3]
        self.word2vec = args.word2vec
        self.terminal_flag = False
        self.train_epoch_end_flag = False
        self.valid_epoch_end_flag = False
        self.max_data_char_len = 0
        self.max_data_sent_len = 0
        self.agent_mode = agent_mode  # args.agent_mode
        self.context_len = args.context_len  # 100

        if not args.gui_mode2:
            self.stacked_embeddings = args.stacked_embeddings
        elif args.gui_mode2:  # in GUI mode, set different embeddings for different networks
            if agent_mode == 'act':
                self.word_dim = self.tag_dim = self.dis_dim = 3172
                self.stacked_embeddings = StackedEmbeddings([
                    WordEmbeddings('glove'),
                    BertEmbeddings('bert-base-uncased')
                ])
            elif agent_mode == 'arg':
                self.word_dim = self.tag_dim = self.dis_dim = 868
                self.stacked_embeddings = StackedEmbeddings(
                    [WordEmbeddings('glove'),
                     ELMoEmbeddings('small')])

        # read the sentences!!!
        if not args.gui_mode:
            if self.agent_mode == 'arg':

                indata = load_pkl('data/refined_%s_data.pkl' % self.domain)[-1]
                arg_sents = []

                for i in tqdm(range(len(indata))):
                    for j in range(len(indata[i])):
                        if len(indata[i][j]) == 0:
                            continue
                        # -1 obj_ind refer to UNK
                        # words = indata[i][j]['last_sent'] + indata[i][j]['this_sent'] + ['UNK'] # we don't need an unknown here.
                        words = indata[i][j]['last_sent'] + indata[i][j][
                            'this_sent']
                        current_sent = indata[i][j]['this_sent']
                        sent_len = len(
                            words)  #here sent len is last_sent + this_sent.
                        act_inds = [
                            a['act_idx'] for a in indata[i][j]['acts']
                            if a['act_idx'] < self.num_words
                        ]  #list of action indexes less than self.num_words = 128
                        for k in range(len(indata[i][j]['acts'])):
                            act_ind = indata[i][j]['acts'][k][
                                'act_idx']  # action index
                            obj_inds = indata[i][j]['acts'][k][
                                'obj_idxs']  # object index list
                            arg_sent = {}

                            # set arg tags
                            arg_tags = np.ones(sent_len,
                                               dtype=np.int32)  # tags

                            if len(obj_inds[1]) == 0:
                                arg_tags[obj_inds[0]] = 2  # essential objects
                            else:
                                arg_tags[obj_inds[0]] = 4  # exclusive objects
                                arg_tags[obj_inds[1]] = 4  # exclusive objects

                            # set distance
                            position = np.zeros(sent_len, dtype=np.int32)
                            position.fill(act_ind)
                            distance = np.abs(np.arange(sent_len) - position)

                            arg_sent['tokens'] = words
                            arg_sent['tags'] = arg_tags
                            arg_sent['act_ind'] = act_ind
                            arg_sent['distance'] = distance
                            arg_sent['act_inds'] = act_inds
                            arg_sent['obj_inds'] = obj_inds

                            # ipdb.set_trace()
                            sent_vec = []

                            if args.stacked_embeddings == 'word2vec':
                                for w in arg_sent['tokens']:
                                    if len(w) > self.max_data_char_len:
                                        self.max_data_char_len = len(w)
                                    if w in self.word2vec.vocab:
                                        sent_vec.append(self.word2vec[w])
                                    else:
                                        sent_vec.append(np.zeros(
                                            self.word_dim))
                            else:
                                # Stacked embeddings
                                line = ' '.join(words)
                                sent = Sentence(line)
                                args.stacked_embeddings.embed(sent)
                                for token in sent:
                                    sent_vec.append(token.embedding.numpy())

                                for w in arg_sent['tokens']:
                                    if len(w) > self.max_data_char_len:
                                        self.max_data_char_len = len(w)

                            sent_vec = np.array(sent_vec)
                            pad_len = self.num_words - len(sent_vec)

                            if len(sent_vec) > self.max_data_sent_len:
                                self.max_data_sent_len = len(sent_vec)

                            distance = np.zeros([self.num_words, self.dis_dim])
                            act_vec = sent_vec[arg_sent[
                                'act_ind']]  # word vector of the input action

                            # TODO: Attention is not required for contextual word embeddings, so commented it out to save time. Try it out if time permits.
                            # attention = np.sum(sent_vec * act_vec, axis=1)  # attention between the input action and its context
                            # attention = np.exp(attention)
                            # attention /= sum(attention)

                            if pad_len > 0:
                                # doc_vec = np.concatenate((doc_vec, np.zeros([pad_len, self.word_dim])))  # doc_vec.shape = [500, 50]
                                # act_text['tags'] = np.concatenate((np.array(act_text['tags']), np.ones(pad_len, dtype=np.int32)))  # [500]
                                sent_vec = np.concatenate(
                                    (sent_vec,
                                     np.zeros([pad_len, self.word_dim])))  #
                                arg_sent['tags'] = np.concatenate(
                                    (np.array(arg_sent['tags']),
                                     np.ones(pad_len, dtype=np.int32)))
                                # attention = np.concatenate((attention, np.zeros(pad_len)))
                                for d in range(len(arg_sent['distance'])):
                                    distance[d] = arg_sent['distance'][d]
                            else:
                                sent_vec = sent_vec[:self.num_words]
                                arg_sent['tokens'] = arg_sent[
                                    'tokens'][:self.num_words]
                                arg_sent['tags'] = np.array(
                                    arg_sent['tags'])[:self.num_words]
                                # attention = attention[: self.num_words]
                                for d in range(self.num_words):
                                    distance[d] = arg_sent['distance'][d]

                            # TODO: Future work: Use attention
                            # if self.use_act_att:  # apply attention to word embedding
                            #     sent_vec = attention.reshape(-1, 1) * sent_vec

                            sent_vec = np.concatenate((sent_vec, distance),
                                                      axis=1)

                            arg_sent['sent_vec'] = sent_vec
                            arg_sent['tags'].shape = (self.num_words, 1)
                            # self.create_matrix(arg_sent,words) #create_matrix function
                            arg_sents.append(arg_sent)
                '''
                Split into train and test first, then split train into train and val.
                '''
                self.train_data, self.test_data = train_test_split(
                    arg_sents, test_size=0.2, random_state=1)
                self.train_data, self.validation_data = train_test_split(
                    self.train_data, test_size=0.2, random_state=1)

                self.train_steps = len(self.train_data) * self.num_words
                self.validation_steps = len(
                    self.validation_data) * self.num_words
                self.test_steps = len(self.test_data) * self.num_words

                self.num_train = len(self.train_data)
                self.num_validation = len(self.validation_data)
                self.num_test = len(self.test_data)

                print('\n\ntraining texts: %d\tvalidation texts: %d' %
                      (len(self.train_data), len(self.validation_data)))
                print('max_data_sent_len: %d\tmax_data_char_len: %d' %
                      (self.max_data_sent_len, self.max_data_char_len))
                print('self.train_steps: %d\tself.valid_steps: %d\n\n' %
                      (self.train_steps, self.validation_steps))

                print('\n\ntest texts: %d\t self.test_steps:%d\n' %
                      (len(self.test_data), self.test_steps))
            else:  #actions
                # self.read_act_texts()

                # read action texts into input_data

                input_data = load_pkl('data/%s_labeled_text_data.pkl' %
                                      self.domain)

                # unroll the stuff inside and store it in a list called act_texts
                act_texts = []
                for i in range(
                        len(input_data
                            )):  #until length of training examples (documents)
                    if len(
                            input_data[i]
                        ['words']) == 0:  #if there are no words in a document
                        continue
                    # act_text is a dictionary to store info.
                    act_text = {}
                    act_text['tokens'] = input_data[i][
                        'words']  #tokens = individual words
                    act_text['sents'] = input_data[i][
                        'sents']  #sents = sentences [['a ','cat ', 'runs.'], [ ], ...]
                    act_text['acts'] = input_data[i][
                        'acts']  #acts = [{},{},{}, ..] where {} = 4 tuple containing keys: [act_idx, obj_idxs, act_type, related_acts]
                    act_text['sent_acts'] = input_data[i][
                        'sent_acts']  #list of acts in a sentence for every sentence.
                    act_text['word2sent'] = input_data[i][
                        'word2sent']  # {0:0, 1:0, 2:0, .... 38:2....} Mapping of word_index to sentence_index
                    act_text['tags'] = np.ones(
                        len(input_data[i]['words']), dtype=np.int32
                    )  #same length as number of words in a document.
                    act_text['act2related'] = {}  #related actions

                    #for all action 4 tuples
                    for acts in input_data[i]['acts']:
                        act_text['act2related'][acts['act_idx']] = acts[
                            'related_acts']  # act_text['act2related'] = {act_idx: []} where [] is list of related actions
                        act_text['tags'][acts['act_idx']] = acts[
                            'act_type'] + 1  # TODO: 2, 3, 4? - why? act_text['tags'] = [2,3,4,2,2,3,3,4,4,...] where index of array is action_index

                    # self.create_matrix(act_text)
                    # Creating matrix
                    doc_vec = []

                    if args.stacked_embeddings != 'word2vec':
                        # doing Flair embeddings
                        for sent in tqdm(act_text['sents']):
                            line = ' '.join(sent)
                            sentence = Sentence(line)
                            args.stacked_embeddings.embed(sentence)
                            for token in sentence:
                                # print(token.embedding.shape)  # 4196

                                doc_vec.append(token.embedding.numpy())

                        #initialize word2vec or zeroes
                        for word in act_text['tokens']:
                            if len(word) > self.max_data_char_len:
                                self.max_data_char_len = len(
                                    word
                                )  #max_data_char_len shows longest word.
                            # if word in self.word2vec.vocab:
                            #     doc_vec.append(self.word2vec[word])
                            # else:
                            #     doc_vec.append(np.zeros(self.word_dim))

                    elif args.stacked_embeddings == 'word2vec':
                        # initialize word2vec or zeroes
                        for word in act_text['tokens']:
                            if len(word) > self.max_data_char_len:
                                self.max_data_char_len = len(
                                    word
                                )  # max_data_char_len shows longest word.
                            if word in self.word2vec.vocab:
                                doc_vec.append(self.word2vec[word])
                            else:
                                doc_vec.append(np.zeros(self.word_dim))

                    doc_vec = np.array(doc_vec)
                    pad_len = self.num_words - len(doc_vec)
                    if len(doc_vec) > self.max_data_sent_len:
                        self.max_data_sent_len = len(
                            doc_vec
                        )  #max_data_sent_len is length of longest document vector..

                    # print(doc_vec.shape)

                    if pad_len > 0:  #if not negative.
                        doc_vec = np.concatenate(
                            (doc_vec,
                             np.zeros([pad_len, self.word_dim
                                       ])))  # doc_vec.shape = [500, 50]
                        act_text['tags'] = np.concatenate(
                            (np.array(act_text['tags']),
                             np.ones(pad_len, dtype=np.int32)))  # [500]
                    else:  #pad_len is negative
                        doc_vec = doc_vec[:self.num_words]  #pick first 500
                        act_text['tokens'] = act_text[
                            'tokens'][:self.
                                      num_words]  #also in tokens, first 500
                        act_text['tags'] = np.array(
                            act_text['tags']
                        )[:self.num_words]  #also in tags, first 500

                    act_text[
                        'sent_vec'] = doc_vec  # set sentence vec to 500,50 doc_vec
                    act_text['tags'].shape = (self.num_words, 1
                                              )  # redefine shape to 500,1

                    act_texts.append(
                        act_text)  #keep collecting documents in act_texts
                '''
                Split into train and test first, then split train into train and val.
                '''
                # seed makes sure dataset is always split in the same way randomly
                self.train_data, self.test_data = train_test_split(
                    act_texts, test_size=0.2, random_state=1)
                self.train_data, self.validation_data = train_test_split(
                    self.train_data, test_size=0.2, random_state=1)

                self.train_steps = len(
                    self.train_data
                ) * self.num_words  # length of train data * 500
                self.validation_steps = len(
                    self.validation_data
                ) * self.num_words  #length of validation data * 500 -- Why a step includes multiplication with num_words?  because each training and val example contains 500 words.
                self.test_steps = len(self.test_data) * self.num_words

                self.num_train = len(self.train_data)
                self.num_validation = len(self.validation_data)
                self.num_test = len(self.test_data)

                print('\n\ntraining texts: %d\tvalidation texts: %d' %
                      (len(self.train_data), len(self.validation_data)))
                print('max_data_sent_len: %d\tmax_data_char_len: %d' %
                      (self.max_data_sent_len,
                       self.max_data_char_len))  #sent len means doc len
                print('self.train_steps: %d\tself.valid_steps: %d\n\n' %
                      (self.train_steps, self.validation_steps))

                print('\n\ntest texts: %d\t self.test_steps:%d\n' %
                      (len(self.test_data), self.test_steps))

            args.train_steps = self.train_steps
            args.valid_steps = self.validation_steps  # validation steps
            args.test_steps = self.test_steps

    def restart(self, train_flag, init=False, test_flag=False):
        if train_flag:
            if init:
                self.train_text_ind = -1
                self.train_epoch_end_flag = False
            self.train_text_ind += 1
            if self.train_text_ind >= len(self.train_data):
                self.train_epoch_end_flag = True
                print('\n\n-----train_epoch_end_flag = True-----\n\n')
                return
            self.current_text = self.train_data[self.train_text_ind %
                                                self.num_train]
            print('\ntrain_text_ind: %d of %d' %
                  (self.train_text_ind, len(self.train_data)))
        elif test_flag:
            print("Testing unseen data")
            if init:
                self.test_text_ind = -1
                self.test_epoch_end_flag = False
            self.test_text_ind += 1
            if self.test_text_ind >= len(self.test_data):
                self.valid_epoch_end_flag = True
                print('\n\n-----test_epoch_end_flag = True-----\n\n')
                return
            self.current_text = self.test_data[self.test_text_ind]
            print('\ntest_text_ind: %d of %d' %
                  (self.test_text_ind, len(self.test_data)))

        else:
            if init:
                self.valid_text_ind = -1
                self.valid_epoch_end_flag = False
            self.valid_text_ind += 1
            if self.valid_text_ind >= len(self.validation_data):
                self.valid_epoch_end_flag = True
                print('\n\n-----valid_epoch_end_flag = True-----\n\n')
                return
            self.current_text = self.validation_data[self.valid_text_ind]
            print('\nvalid_text_ind: %d of %d' %
                  (self.valid_text_ind, len(self.validation_data)))

        self.text_vec = np.concatenate(
            (self.current_text['sent_vec'], self.current_text['tags']), axis=1)
        self.state = self.text_vec.copy()
        self.state[:, -1] = 0
        self.terminal_flag = False

    def act(self, action, word_ind):
        '''
        Performs the action and returns the reward.
        An even number refers to a tagging action; an odd number refers to a non-action.
        '''
        self.state[word_ind, -1] = action + 1
        # t_a_count = 0  #amount of tagged actions
        t_a_count = sum(self.state[:word_ind + 1, -1]) - (word_ind + 1)
        t_a_rate = float(t_a_count) / self.num_words

        label = self.text_vec[word_ind, -1]
        self.real_action_flag = False
        if self.agent_mode == 'arg':
            # text_vec is labelled data
            if label >= 2:
                self.real_action_flag = True
            if label == 2:
                if action == 1:
                    reward = self.ra[1] * self.reward_base
                else:
                    reward = -self.ra[1] * self.reward_base
            elif label == 4:
                right_flag = True
                if word_ind in self.current_text['obj_inds'][0]:
                    exc_objs = self.current_text['obj_inds'][1]
                else:
                    exc_objs = self.current_text['obj_inds'][0]
                for oi in exc_objs:  # exclusive objs
                    if self.state[oi, -1] == 2:
                        right_flag = False
                        break
                if action == 1 and right_flag:
                    reward = self.ra[2] * self.reward_base
                elif action == 2 and not right_flag:
                    reward = self.ra[2] * self.reward_base
                elif action == 2 and word_ind != self.current_text['obj_inds'][
                        1][-1]:
                    reward = self.ra[2] * self.reward_base
                else:
                    reward = -self.ra[2] * self.reward_base
            else:  # if label == 1: # non_action
                if action == 0:
                    reward = self.ra[0] * self.reward_base
                else:
                    reward = -self.ra[0] * self.reward_base

        else:  # self.agent_mode == 'act'
            if label >= 2:
                self.real_action_flag = True
            if label == 2:  # required action
                if action == 1:  # extracted as action
                    reward = self.ra[1] * self.reward_base
                else:  # filtered out
                    reward = -self.ra[1] * self.reward_base
            elif label == 3:  # optional action
                if action == 1:
                    reward = self.ra[0] * self.reward_base
                else:
                    reward = 0.0
            elif label == 4:  # exclusive action
                # ipdb.set_trace()
                assert word_ind in self.current_text['act2related']
                exclusive_act_inds = self.current_text['act2related'][word_ind]
                exclusive_flag = False
                not_biggest_flag = False
                for ind in exclusive_act_inds:
                    if self.state[ind, -1] == 2:  # extracted as action
                        exclusive_flag = True
                    if ind > word_ind:
                        not_biggest_flag = True
                if action == 1 and not exclusive_flag:
                    # extract current word and no former exclusive action was extracted
                    reward = self.ra[2] * self.reward_base
                elif action == 0 and exclusive_flag:
                    # filtered out current word because one former exclusive action was extracted
                    reward = self.ra[2] * self.reward_base
                elif action == 0 and not_biggest_flag:
                    # filtered out current word and at least one exclusive action left
                    reward = self.ra[2] * self.reward_base
                else:
                    reward = -self.ra[2] * self.reward_base
            else:  # if label == 1: # non_action
                if action == 0:
                    reward = self.ra[0] * self.reward_base
                else:
                    reward = -self.ra[0] * self.reward_base

        if self.use_act_rate and reward != 0:
            if t_a_rate <= self.action_rate and reward > 0:
                reward += 5.0 * np.square(t_a_rate) * self.reward_base
            else:
                reward -= 5.0 * np.square(t_a_rate) * self.reward_base
        # all words of current text are tagged, break
        if word_ind + 1 >= len(self.current_text['tokens']):
            self.terminal_flag = True

        return reward

    def getState(self):
        '''
        Gets current text state
        '''
        return self.state

    def isTerminal(self):
        '''
        Returns whether tagging is done;
        the episode terminates once all words of the text have been tagged
        '''
        return self.terminal_flag

# ==================================== GUI MODE functions/Driver Mode functions

    def init_predict_act_text(self, raw_text):

        text = {'tokens': [], 'sents': [], 'word2sent': {}}
        for s in raw_text:
            words = s.split()
            if len(words) > 0:
                for i in range(len(words)):  # for word 0 to word n-1
                    text['word2sent'][i + len(text['tokens'])] = [
                        len(text['sents']), i
                    ]
                text['tokens'].extend(words)
                text['sents'].append(words)

        sent_vec = np.zeros([self.num_words, self.word_dim + 1
                             ])  # 512 x (968 + 1) ------ 1 for tag

        if self.stacked_embeddings == 'word2vec':
            for i, w in enumerate(text['tokens']):
                if i >= self.num_words:
                    break
                if w in self.word2vec.vocab:
                    sent_vec[i][:self.word_dim] = self.word2vec[w]

        else:
            word_count = 0
            z = 3172  #bert

            s = torch.zeros(0, z)
            for sent in tqdm(text['sents']):
                line = ' '.join(sent)
                sentence = Sentence(line)

                self.stacked_embeddings.embed(sentence)
                w = torch.zeros(0, z)
                for token in sentence:
                    # print(token.embedding.shape)  # 868 for elmo
                    sent_vec[word_count][:self.
                                         word_dim] = token.embedding.numpy()
                    word_count += 1
                    w = torch.cat((w, token.embedding.view(-1, z)),
                                  0)  #stack the words
                s = torch.cat((s, w.mean(dim=0).view(-1, z)), 0)  #average them

        self.state = sent_vec
        self.terminal_flag = False
        self.current_text = text
        return s

    def init_predict_arg_text(self, act_idx, text):
        '''used in gui mode'''
        self.terminal_flag = False
        sents = text['sents']
        word2sent = text['word2sent']
        sent_idx = word2sent[act_idx][0]
        word_ids = []
        this_sent = sents[sent_idx]
        if sent_idx > 0:  # use the former sentence and current one
            last_sent = sents[sent_idx - 1]
            for k, v in word2sent.items():
                if v[0] == sent_idx or v[0] == sent_idx - 1:
                    word_ids.append(k)
        else:
            last_sent = []
            for k, v in word2sent.items():
                if v[0] == sent_idx:
                    word_ids.append(k)
        words = last_sent + this_sent  #+ ['UNK']
        end_idx = max(word_ids)  # the last index of words of these two sents
        start_idx = min(word_ids)
        sent_len = len(words)

        position = np.zeros(sent_len, dtype=np.int32)
        position.fill(act_idx - start_idx)
        distance = np.abs(np.arange(sent_len) - position)
        # sent_vec = np.zeros([self.context_len, self.word_dim + self.dis_dim + self.tag_dim])
        sent_vec = np.zeros(
            [self.context_len, self.word_dim + self.dis_dim + 1])  # 100x101

        if self.stacked_embeddings == 'word2vec':
            for i, w in enumerate(words):
                if i >= self.context_len:
                    break
                if w in self.word2vec.vocab:
                    sent_vec[i][:self.word_dim] = self.word2vec[w]
                sent_vec[i][self.word_dim:self.word_dim +
                            self.dis_dim] = distance[i]
        else:

            for i, w in enumerate(words):
                if i >= self.context_len:
                    break
                # if w in self.word2vec.vocab:
                #     sent_vec[i][: self.word_dim] = self.word2vec[w]
                # sent_vec[i][self.word_dim: self.word_dim + self.dis_dim] = distance[i]

            #stacked embeddings
            full_sent = ' '.join(words)
            full_sent = Sentence(full_sent)
            self.stacked_embeddings.embed(full_sent)

            for i, token in enumerate(full_sent):
                sent_vec[i][:self.word_dim] = token.embedding.numpy()
                sent_vec[i][self.word_dim:self.word_dim +
                            self.dis_dim] = distance[i]
        self.state = sent_vec
        self.current_text = {
            'tokens': words,
            'word2sent': word2sent,
            'distance': distance
        }
        return last_sent, this_sent

    def act_online(self, action, word_ind):
        '''used in gui mode'''
        self.state[word_ind, -1] = action + 1
        # print(self.state[word_ind, self.word_dim: self.word_dim + self.dis_dim]) #distance
        # self.state[word_ind, -self.tag_dim:] = action + 1 #from 868 from last to end change it
        # print(self.state.shape)
        if word_ind + 1 >= len(self.current_text['tokens']):
            self.terminal_flag = True
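
# --- Usage sketch (not part of the original snippet) ------------------------
# A minimal driver loop for the tagging environment above, assuming the class
# exposes restart()/getState()/act()/isTerminal() as shown; the `agent` object
# and its predict(state, word_ind) method (returning 0 or 1) are hypothetical,
# and the restart keyword names are inferred from the visible branches.
def run_tagging_episode(env, agent, init=True):
    env.restart(train_flag=True, test_flag=False, init=init)
    total_reward, word_ind = 0.0, 0
    while not env.isTerminal():
        state = env.getState()              # (num_words, word_dim + 1) matrix
        action = agent.predict(state, word_ind)
        total_reward += env.act(action, word_ind)
        word_ind += 1
    return total_reward
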
def trainIters_supervised(encoder, model, train_data, val_data, batch_size, n_iters, input_lang, print_every=100,
                          plot_every=10, learning_rate=0.1, weight_decay=1e-4, loss_threshold=0.01, save_every=10, evaluate_every=10):
    labels = {"contradiction": 1, "neutral": 0, "entailment": 0}
    class_names = copy.deepcopy(np.array(list(labels.keys())))

    data = []
    for sentence_pair in train_data:
        normalizedval1 = sentence_pair[0].lower().strip()
        normalizedval1 = re.sub(r"([.!?])", r" \1", normalizedval1)
        normalizedval2 = sentence_pair[1].lower().strip()
        normalizedval2 = re.sub(r"([.!?])", r" \1", normalizedval2)
        if len(normalizedval1.split(' ')) < MAX_LENGTH and len(normalizedval2.split(' ')) < MAX_LENGTH \
                and sentence_pair[2] != "-":
            data.append([normalizedval1, normalizedval2, labels[sentence_pair[2]]])

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate, weight_decay=weight_decay)
    model_optimizer = optim.SGD(model.parameters(), weight_decay=weight_decay, lr=learning_rate)
    #encoder_optimizer = optim.Adam(encoder.parameters(), weight_decay=weight_decay, lr=learning_rate)  # Adam
    #model_optimizer = optim.Adam(model.parameters(), weight_decay=weight_decay, lr=learning_rate)  # Adam

    if word_vecs == "flair":
        flair_embedding = StackedEmbeddings([
            FlairEmbeddings('de-forward'),
            FlairEmbeddings('de-backward'),
        ])
        input_tensors_1 = []
        input_tensors_2 = []
        for training_pair in data:
            sentence1 = Sentence(training_pair[0] + " <EOS>")
            flair_embedding.embed(sentence1)
            input_tensors_1.append(torch.stack([token.embedding for token in sentence1.tokens]))

            sentence2 = Sentence(training_pair[1] + " <EOS>")
            flair_embedding.embed(sentence2)
            input_tensors_2.append(torch.stack([token.embedding for token in sentence2.tokens]))

    else:
        input_tensors_1 = [tensorFromSentence(input_lang, training_pair[0]) for training_pair in data]
        input_tensors_2 = [tensorFromSentence(input_lang, training_pair[1]) for training_pair in data]

    targets = [training_pair[2] for training_pair in data]

    scheduler1 = ReduceLROnPlateau(encoder_optimizer, 'min', verbose=True)
    scheduler2 = ReduceLROnPlateau(model_optimizer, 'min', verbose=True)

    # compensate for unbalanced classes
    targets2 = torch.from_numpy(np.array(targets)).to(device)
    positive_weight = float(targets2.shape[0]) / torch.sum(targets2).type(torch.float32)
    negative_weight = float(targets2.shape[0]) / (targets2.shape[0] - torch.sum(targets2)).type(torch.float32)
    #criterion = nn.NLLLoss(weight=torch.tensor([negative_weight, positive_weight]).to(device))  # [0.5,1.0]
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([negative_weight, positive_weight]).to(device))  # [0.5,1.0]

    print_loss_avg_last = 0.0
    all_data = list(zip(input_tensors_1, input_tensors_2, targets))
    for iter in range(1, n_iters + 1):
        # create new random batches in each iteration
        batches = []
        current_batch_inputs1 = []
        current_batch_inputs2 = []
        current_batch_targets = []

        random.shuffle(all_data)
        for input_tensor_1, input_tensor_2, target in all_data:
            if len(current_batch_targets) < batch_size:
                current_batch_inputs1.append(input_tensor_1)
                current_batch_inputs2.append(input_tensor_2)
                current_batch_targets.append(target)
            else:
                batches.append((copy.deepcopy(current_batch_inputs1), copy.deepcopy(current_batch_inputs2),
                                torch.from_numpy(np.array(current_batch_targets)).to(device)))
                # start the next batch with the current example so it is not dropped
                current_batch_inputs1 = [input_tensor_1]
                current_batch_inputs2 = [input_tensor_2]
                current_batch_targets = [target]

        # for i, batch in enumerate(dataloader):
        for batch in batches:
            encoder_optimizer.zero_grad()
            model_optimizer.zero_grad()
            concat_tensors = []

            input_tensors_1 = batch[0]
            input_tensors_2 = batch[1]
            targets = batch[2]

            for input_tensor_1, input_tensor_2 in zip(input_tensors_1, input_tensors_2):
                # get the embedding for the first sentence
                encoder_hidden_1 = encoder.initHidden()

                input_length_1 = input_tensor_1.size(0)
                input_length_2 = input_tensor_2.size(0)

                for ei in range(input_length_1):
                    _, encoder_hidden_1 = encoder(
                        input_tensor_1[ei], encoder_hidden_1)

                emb1 = torch.max(encoder_hidden_1[0][0], encoder_hidden_1[1][0])

                # re-initialize encoder hidden state and get the embedding for the second sentence
                encoder_hidden_2 = encoder.initHidden()

                for ei in range(input_length_2):
                    _, encoder_hidden_2 = encoder(
                        input_tensor_2[ei], encoder_hidden_2)
                emb2 = torch.max(encoder_hidden_2[0][0], encoder_hidden_2[1][0])

                # concatenate the embeddings
                concat_tensors.append(torch.cat((emb1, emb2), -1).to(device))

                #gc.collect()

            concat_tensors = torch.stack(tuple(concat_tensors))

            log_probs = model.forward(concat_tensors)

            loss = criterion(log_probs, targets)
            loss.backward()

            encoder_optimizer.step()
            model_optimizer.step()

            print_loss_total += loss.item()
            plot_loss_total += loss.item()

            gc.collect()

        scheduler1.step(print_loss_total)
        scheduler2.step(print_loss_total)
        del batches

        if iter % save_every == 0:
            torch.save(encoder.state_dict(),
                       "/data/maren_semantic_analysis/E2E/{0}/encoder_epoch_{1}.pt".format(model_save_name, iter))
            torch.save(model.state_dict(),
                       "/data/maren_semantic_analysis/E2E/{0}/model_epoch_{1}.pt".format(model_save_name, iter))

        if iter % evaluate_every == 0:
            print("Performance in epoch {0}:".format(iter))
            train_embeddings, train_y, _ = get_embeddings_for_dataset(train_data, encoder, input_lang,
                                                                      bidirectional=True, pooling="max")
            val_embeddings, val_y, _ = get_embeddings_for_dataset(val_data, encoder, input_lang,
                                                                    bidirectional=True, pooling="max")

            with torch.no_grad():
                pred = []
                for vector, label in zip(val_embeddings, val_y):
                    # log_probs = model(torch.from_numpy(vector.todense()).type(torch.float32))
                    log_probs = model(vector)
                    log_probs_cpu = log_probs.cpu()
                    pred.append(int(np.argmax(log_probs_cpu)))
                    del log_probs_cpu

            acc = accuracy_score(val_y, pred)
            print("Accuracy of the model: {0}".format(acc))

            print_confusion_matrix(val_y, pred, classes=class_names, normalize=False)

            train_embeddings_cpu = train_embeddings.cpu()
            val_embeddings_cpu = val_embeddings.cpu()

            clf = LogisticRegression(solver="newton-cg", multi_class="multinomial", class_weight="balanced")  # class_weight="balanced"
            clf.fit(train_embeddings_cpu, train_y)
            pred = clf.predict(val_embeddings_cpu)
            acc = accuracy_score(val_y, pred)
            print("Accuracy of Logistic regression: {0}".format(acc))

            del clf
            del train_embeddings_cpu
            del val_embeddings_cpu

            print_confusion_matrix(val_y, pred, classes=class_names, normalize=False)

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))
            print_loss_total = 0

            if loss_threshold is not None and np.abs(print_loss_avg - print_loss_avg_last) < loss_threshold:
                print("Loss has converged! Stopping training")
                print("Epoch {0}".format(iter))
                break
            # print_loss_avg_last = copy.deepcopy(print_loss_avg)

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
        gc.collect()
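
# --- Sketch (not part of the original snippet) ------------------------------
# A compact alternative to the shuffling/batching loop in trainIters_supervised
# above; it assumes the same (input_tensor_1, input_tensor_2, target) triples
# and the same `random`, `np`, `torch` and `device` names from this file, and
# it keeps the trailing partial batch instead of silently discarding leftovers.
def make_batches(all_data, batch_size, device):
    random.shuffle(all_data)
    for start in range(0, len(all_data), batch_size):
        chunk = all_data[start:start + batch_size]
        inputs1 = [t1 for t1, _, _ in chunk]
        inputs2 = [t2 for _, t2, _ in chunk]
        targets = torch.from_numpy(np.array([y for _, _, y in chunk])).to(device)
        yield inputs1, inputs2, targets
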
Ejemplo n.º 19
0
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load and run ELMo embeddings
embedding = ELMoEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load and run stacked (mixed) embeddings
stacked_embeddings = StackedEmbeddings([WordEmbeddings('model/glove.gensim'), FlairEmbeddings('model/news-forward-0.4.1.pt')])
sentence = Sentence('The grass is green .')
stacked_embeddings.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# CharacterEmbeddings and BytePairEmbeddings; without internet access the model download at runtime will fail
embedding = CharacterEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
embedding = BytePairEmbeddings('en')
sentence = Sentence('The grass is green .')
embedding.embed(sentence)

Ejemplo n.º 20
0
from flair.embeddings import WordEmbeddings, TransformerWordEmbeddings, StackedEmbeddings, GazetteerEmbeddings
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from datasets import list_datasets, load_dataset, list_metrics, load_metric

# corpus1 = WNUT_17()
# corpus = CONLL_03()
sentences_1 = Sentence('I love Sandys Fort Spring!')
sentences_2 = Sentence('The Land Tenure Reform Association (LTRA).')
sentence_list = [sentences_1, sentences_2]

label_list = ['PER', 'ORG', 'LOC', 'MISC']
glove_embedding = WordEmbeddings('glove')
gazetteer_embedding: GazetteerEmbeddings = GazetteerEmbeddings(
    path_to_gazetteers="../test_gazetteerEmbeddings/resources",
    partial_matching=True,
    full_mathing=True,
    label_list=label_list)
# gazetteer_embedding.embed(sentence_list)
print(gazetteer_embedding.feature_list)

stacked_embeddings = StackedEmbeddings([glove_embedding, gazetteer_embedding])

stacked_embeddings.embed(sentence_list)

for sentence in sentence_list:
    for token in sentence:
        print(token)
        print(token.embedding)
Ejemplo n.º 21
0
class TPLinkerNER(nn.Module):
    def __init__(self, char_encoder_config, word_encoder_config, flair_config,
                 handshaking_kernel_config, hidden_size, activate_enc_fc,
                 entity_type_num, bert_config):
        super().__init__()
        '''
        char_encoder_config = {
            "char_size": len(char2idx), # 
            "emb_dim": char_emb_dim,
            "emb_dropout": char_emb_dropout,
            "bilstm_layers": char_bilstm_layers,
            "bilstm_dropout": char_bilstm_dropout,
            "max_char_num_in_tok": max_char_num_in_tok,
        }
        bert_config = {
            "path": encoder_path,
            "fintune": bert_finetune,
            "use_last_k_layers": use_last_k_layers_hiddens,
        }
        word_encoder_config = {
            "init_word_embedding_matrix": init_word_embedding_matrix,
            "emb_dropout": word_emb_dropout,
            "bilstm_layers": word_bilstm_layers,
            "bilstm_dropout": word_bilstm_dropout,
            "freeze_word_emb": freeze_word_emb,
        }

        handshaking_kernel_config = {
            "shaking_type": hyper_parameters["shaking_type"],
            "context_type": hyper_parameters["context_type"],
            "visual_field": visual_field, # 
        }
        '''
        combined_hidden_size = 0
        self.char_encoder_config = char_encoder_config
        if char_encoder_config is not None:
            # char encoder
            char_size = char_encoder_config["char_size"]
            char_emb_dim = char_encoder_config["emb_dim"]
            char_emb_dropout = char_encoder_config["emb_dropout"]
            char_bilstm_hidden_size = char_encoder_config["bilstm_hidden_size"]
            char_bilstm_layers = char_encoder_config["bilstm_layers"]
            char_bilstm_dropout = char_encoder_config["bilstm_dropout"]
            max_char_num_in_subword = char_encoder_config[
                "max_char_num_in_tok"]
            self.char_emb = nn.Embedding(char_size, char_emb_dim)
            self.char_emb_dropout = nn.Dropout(p=char_emb_dropout)
            self.char_lstm_l1 = nn.LSTM(char_emb_dim,
                                        char_bilstm_hidden_size[0] // 2,
                                        num_layers=char_bilstm_layers[0],
                                        dropout=char_bilstm_dropout[0],
                                        bidirectional=True,
                                        batch_first=True)
            self.char_lstm_dropout = nn.Dropout(p=char_bilstm_dropout[1])
            self.char_lstm_l2 = nn.LSTM(char_bilstm_hidden_size[0],
                                        char_bilstm_hidden_size[1] // 2,
                                        num_layers=char_bilstm_layers[1],
                                        dropout=char_bilstm_dropout[2],
                                        bidirectional=True,
                                        batch_first=True)
            self.char_cnn = nn.Conv1d(char_bilstm_hidden_size[1],
                                      char_bilstm_hidden_size[1],
                                      max_char_num_in_subword,
                                      stride=max_char_num_in_subword)
            combined_hidden_size += char_bilstm_hidden_size[1]

        # word encoder
        ## init word embedding
        self.word_encoder_config = word_encoder_config
        if word_encoder_config is not None:
            word2idx = word_encoder_config["word2idx"]
            word_size = len(word2idx)
            word_emb_key = word_encoder_config["emb_key"]
            word_emb_dropout = word_encoder_config["emb_dropout"]
            word_bilstm_hidden_size = word_encoder_config["bilstm_hidden_size"]
            word_bilstm_layers = word_encoder_config["bilstm_layers"]
            word_bilstm_dropout = word_encoder_config["bilstm_dropout"]
            freeze_word_emb = word_encoder_config["freeze_word_emb"]

            print("Loading pretrained word embeddings...")
            if word_emb_key == "glove":
                glove_df = pd.read_csv(
                    '../../pretrained_emb/glove/glove.6B.100d.txt',
                    sep=" ",
                    quoting=3,
                    header=None,
                    index_col=0)
                pretrained_emb = {
                    key: val.values
                    for key, val in glove_df.T.items()
                }
                word_emb_dim = len(list(pretrained_emb.values())[0])
            elif word_emb_key == "pubmed":
                pretrained_emb = word2vec.load(
                    '../../pretrained_emb/bio_nlp_vec/PubMed-shuffle-win-30.bin'
                )
                word_emb_dim = len(pretrained_emb.vectors[0])
            init_word_embedding_matrix = np.random.normal(-0.5,
                                                          0.5,
                                                          size=(word_size,
                                                                word_emb_dim))
            hit_count = 0
            for word, idx in tqdm(word2idx.items(),
                                  desc="Init word embedding matrix"):
                if word in pretrained_emb:
                    hit_count += 1
                    init_word_embedding_matrix[idx] = pretrained_emb[word]
            print("pretrained word embedding hit rate: {}".format(hit_count /
                                                                  word_size))
            init_word_embedding_matrix = torch.FloatTensor(
                init_word_embedding_matrix)

            ## word encoder model
            self.word_emb = nn.Embedding.from_pretrained(
                init_word_embedding_matrix, freeze=freeze_word_emb)
            self.word_emb_dropout = nn.Dropout(p=word_emb_dropout)
            self.word_lstm_l1 = nn.LSTM(word_emb_dim,
                                        word_bilstm_hidden_size[0] // 2,
                                        num_layers=word_bilstm_layers[0],
                                        dropout=word_bilstm_dropout[0],
                                        bidirectional=True,
                                        batch_first=True)
            self.word_lstm_dropout = nn.Dropout(p=word_bilstm_dropout[1])
            self.word_lstm_l2 = nn.LSTM(word_bilstm_hidden_size[0],
                                        word_bilstm_hidden_size[1] // 2,
                                        num_layers=word_bilstm_layers[1],
                                        dropout=word_bilstm_dropout[2],
                                        bidirectional=True,
                                        batch_first=True)
            combined_hidden_size += word_bilstm_hidden_size[1]

        # bert
        self.bert_config = bert_config
        if bert_config is not None:
            bert_path = bert_config["path"]
            bert_finetune = bert_config["finetune"]
            self.use_last_k_layers_bert = bert_config["use_last_k_layers"]
            self.bert = AutoModel.from_pretrained(bert_path)
            if not bert_finetune:  # if train without finetuning bert
                for param in self.bert.parameters():
                    param.requires_grad = False


#             hidden_size = self.bert.config.hidden_size
            combined_hidden_size += self.bert.config.hidden_size

        # flair
        self.flair_config = flair_config
        if flair_config is not None:
            embedding_models = [
                FlairEmbeddings(emb_id)
                for emb_id in flair_config["embedding_ids"]
            ]
            self.stacked_flair_embeddings_model = StackedEmbeddings(
                embedding_models)
            combined_hidden_size += embedding_models[0].embedding_length * len(
                embedding_models)

        # encoding fc
        self.enc_fc = nn.Linear(combined_hidden_size, hidden_size)
        self.activate_enc_fc = activate_enc_fc

        # handshaking kernel
        shaking_type = handshaking_kernel_config["shaking_type"]
        context_type = handshaking_kernel_config["context_type"]
        visual_field = handshaking_kernel_config["visual_field"]
        self.handshaking_kernel = HandshakingKernel(hidden_size, shaking_type,
                                                    context_type, visual_field)

        # decoding fc
        self.dec_fc = nn.Linear(hidden_size, entity_type_num)

    def forward(self,
                char_input_ids=None,
                word_input_ids=None,
                padded_sents=None,
                subword_input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                subword2word_idx=None):

        # features
        features = []
        # char
        if self.char_encoder_config is not None:
            # char_input_ids: (batch_size, seq_len * max_char_num_in_subword)
            # char_input_emb/char_hiddens: (batch_size, seq_len * max_char_num_in_subword, char_emb_dim)
            # char_conv_output: (batch_size, seq_len, char_emb_dim)
            char_input_emb = self.char_emb(char_input_ids)
            char_input_emb = self.char_emb_dropout(char_input_emb)
            char_hiddens, _ = self.char_lstm_l1(char_input_emb)
            char_hiddens, _ = self.char_lstm_l2(
                self.char_lstm_dropout(char_hiddens))
            char_conv_output = self.char_cnn(char_hiddens.permute(0, 2,
                                                                  1)).permute(
                                                                      0, 2, 1)
            features.append(char_conv_output)

        # word
        if self.word_encoder_config is not None:
            # word_input_ids: (batch_size, seq_len)
            # word_input_emb/word_hiddens: batch_size, seq_len, word_emb_dim)
            word_input_emb = self.word_emb(word_input_ids)
            word_input_emb = self.word_emb_dropout(word_input_emb)
            word_hiddens, _ = self.word_lstm_l1(word_input_emb)
            word_hiddens, _ = self.word_lstm_l2(
                self.word_lstm_dropout(word_hiddens))
            if self.bert_config is not None:
                # choose and repeat word hiddens to align with the subword count
                word_hiddens = torch.gather(
                    word_hiddens, 1,
                    subword2word_idx[:, :,
                                     None].repeat(1, 1,
                                                  word_hiddens.size()[-1]))
            features.append(word_hiddens)

        # flair embeddings
        if self.flair_config is not None:
            self.stacked_flair_embeddings_model.embed(padded_sents)
            flair_embeddings = torch.stack([
                torch.stack([tok.embedding for tok in sent])
                for sent in padded_sents
            ])
            if self.bert_config is not None:
                # choose and repeat word hiddens to align with the subword count
                flair_embeddings = torch.gather(
                    flair_embeddings, 1,
                    subword2word_idx[:, :,
                                     None].repeat(1, 1,
                                                  flair_embeddings.size()[-1]))
            features.append(flair_embeddings)

        if self.bert_config is not None:
            # subword_input_ids, attention_mask, token_type_ids: (batch_size, seq_len)
            context_outputs = self.bert(subword_input_ids, attention_mask,
                                        token_type_ids)
            # last_hidden_state: (batch_size, seq_len, hidden_size)
            hidden_states = context_outputs[2]
            subword_hiddens = torch.mean(torch.stack(
                list(hidden_states)[-self.use_last_k_layers_bert:], dim=0),
                                         dim=0)
            features.append(subword_hiddens)

        # combine features
        combined_hiddens = self.enc_fc(torch.cat(features, dim=-1))
        if self.activate_enc_fc:
            combined_hiddens = torch.tanh(combined_hiddens)

        # shaking_hiddens: (batch_size, shaking_seq_len, hidden_size)
        # shaking_seq_len: max_seq_len * vf - sum(1, vf)
        shaking_hiddens = self.handshaking_kernel(combined_hiddens)

        # ent_shaking_outputs: (batch_size, shaking_seq_len, entity_type_num)
        ent_shaking_outputs = self.dec_fc(shaking_hiddens)

        return ent_shaking_outputs
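
# --- Sketch (not part of the original snippet) ------------------------------
# Illustration of the shaking_seq_len comment in forward() above, assuming the
# handshaking kernel pairs each position with itself and the following tokens
# inside its visual field, truncated at the sequence end (the exact convention
# lives in HandshakingKernel, which is not shown here).
def shaking_seq_len(max_seq_len: int, visual_field: int) -> int:
    return sum(min(visual_field, max_seq_len - i) for i in range(max_seq_len))

# e.g. shaking_seq_len(100, 3) == 297 handshaking positions
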
Ejemplo n.º 22
0
class EasyStackedEmbeddings:
    """ Word Embeddings that have been concatenated and "stacked" as specified by flair

    Usage:

    ```python
    >>> embeddings = adaptnlp.EasyStackedEmbeddings("bert-base-cased", "gpt2", "xlnet-base-cased")
    ```

    **Parameters:**

    * **&ast;embeddings** - Non-keyword variable number of strings specifying the embeddings you want to stack
    """
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)

    def embed_text(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
    ) -> List[Sentence]:
        """ Stacked embeddings

        * **text** - Text input; it can be a string or any of Flair's `Sentence` input formats
        **return** A list of Flair's `Sentence`s
        """
        # Convert into sentences
        if isinstance(text, str):
            sentences = [Sentence(text)]
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        elif isinstance(text, Sentence):
            sentences = [text]
        else:
            sentences = text

        # Unlike flair embeddings modules, stacked embeddings do not return a list of sentences
        self.stacked_embeddings.embed(sentences)
        return sentences
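
# --- Usage sketch (not part of the original snippet) ------------------------
# Follows the class docstring above; the model keys and sentence are
# illustrative.
embeddings = EasyStackedEmbeddings("bert-base-cased", "xlnet-base-cased")
sentences = embeddings.embed_text("The grass is green .")
for token in sentences[0]:
    print(token.text, token.embedding.shape)  # one concatenated vector per token
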
Ejemplo n.º 23
0
])

if 'pubmed' in args.model_name.lower():
    bert_embedding.tokenizer.basic_tokenizer.do_lower_case = False

flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)

bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)

    s.clear_embeddings()
    bert_embedding.embed(s)
    emb = get_embs(s)  # (T, 4*H)

    s.clear_embeddings()
    flair_embedding.embed(s)
    emb = np.concatenate([emb, get_embs(s)], axis=-1)

    bert_emb_dict[tokens] = emb.astype('float16')

with open(args.lm_emb_save_path, 'wb') as f:
    pickle.dump(bert_emb_dict, f)
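
# --- Sketch (not part of the original snippet) ------------------------------
# Reading the cached embeddings back; mirrors the pickle.dump above and assumes
# the same args.lm_emb_save_path and the same tokenized item format.
with open(args.lm_emb_save_path, 'rb') as f:
    bert_emb_dict = pickle.load(f)

emb = bert_emb_dict[tuple(dataset[0]['tokens'])]  # (T, concat_dim) float16 array
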
Ejemplo n.º 24
0
        tokenIndicesToReturn = []
        if 'tokenIndicesToReturn' in sentenceTaggingRequest.keys():
            tokenIndicesToReturn = sentenceTaggingRequest[
                'tokenIndicesToReturn']
        if len(tokenIndicesToReturn) == 0:
            numReturnedVectors = numReturnedVectors + len(sentence)
        else:
            numReturnedVectors = numReturnedVectors + len(tokenIndicesToReturn)

    # 2. Write the number of vectors into the output
    ba.extend(pack('>i', numReturnedVectors))

    # Now compute the vectors
    # This does the actual embedding vector computation
    runtime = time.time()
    embeddings.embed(sentences)
    runtime = time.time() - runtime
    #print("flair embedding computation time:", runtime, file=sys.stderr)

    # 3. Get the vectorlength and write it into the output byte array
    vectorlength = len(sentences[0][0].embedding)
    doubleformat = '>' + 'd' * vectorlength
    ba.extend(pack('>i', vectorlength))

    # Now iterate through the sentences and write the output
    for i, sentenceTaggingRequest in enumerate(sentenceTaggingRequests):
        tokenIndicesToReturn = []
        if 'tokenIndicesToReturn' in sentenceTaggingRequest.keys():
            tokenIndicesToReturn = sentenceTaggingRequest[
                'tokenIndicesToReturn']
Ejemplo n.º 25
0
class LanguageModel(torch.nn.Module):
    """
    Fine-tune a language model via a binary classifier for identifying semantic relations
    """
    def __init__(self, config):
        """
        Load pretrained language model
        """
        super(LanguageModel, self).__init__()
        embeddings_stack = []
        transformers = config.get("language_model", "transformers")
        if transformers != "":
            transformers = transformers.split(";")
            for model in transformers:
                embeddings_stack.append(
                    TransformerWordEmbeddings(
                        model,
                        layers="-1",
                        pooling_operation='mean',
                        # use_scalar_mix=True,
                        fine_tune=True))
        word_embeddings = config.get("language_model", "word_embeddings")
        if word_embeddings != "":
            word_embeddings = word_embeddings.split(";")
            for model in word_embeddings:
                embeddings_stack.append(WordEmbeddings(model))
        flair_embeddings = config.get("language_model", "flair_embeddings")
        if flair_embeddings != "":
            flair_embeddings = flair_embeddings.split(";")
            for model in flair_embeddings:
                embeddings_stack.append(FlairEmbeddings(model, fine_tune=True))
        character_embeddings = config.get("language_model",
                                          "character_embeddigs")
        if character_embeddings.lower() == "yes":
            embeddings_stack.append(CharacterEmbeddings())
        bytepair_embeddings = config.get("language_model",
                                         "bytepair_embeddings")
        if bytepair_embeddings.lower() == "yes":
            embeddings_stack.append(BytePairEmbeddings())
        custom_embeddings = config.get("language_model", "custom_embeddings")
        if custom_embeddings != "":
            custom_embeddings = custom_embeddings.split(";")
            for path in custom_embeddings:
                embeddings_stack.append(WordEmbeddings(path))
        self.lm = StackedEmbeddings(embeddings_stack)
        self.embedding_dim = self.lm.embedding_length
        self.dropout = torch.nn.Dropout(
            float(config.get("language_model", "dropout")))
        self.classify = torch.nn.Linear(self.embedding_dim, 2)
        if config.get("language_model", "relu") == "yes":
            self.relu = torch.nn.ReLU()

    def forward(self, data):
        """
        Get contextualized embeddings for input sequence
        :return:
        """
        X = [Sentence(sent) for sent in data]
        self.lm.embed(X)
        # X = [sent[0] for sent in X]
        X = torch.stack([sentence[0].embedding for sentence in X])
        X = self.dropout(X)
        if self.relu is not None:
            X = self.relu(X)
        labels = self.classify(X)
        return labels

    def learn_relations(self):
        """
        Fine tune the language model via binary classification of relation identification, i.e. YES or NO answers.
        Data format is:
        [REL_NAME] <gloss of CONCEPT1 or example containing CONCEPT1> [SEP] CONCEPT <gloss of CONCEPT2 or gloss of CONCEPT1>
        e.g. [HYPONYM] feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats
             [SEP] cat any of various lithe-bodied roundheaded fissiped mammals, many with retractile claws
        :return:
        """

    def match_relation(self):
        """
        Determine whether an input sequence contains a relation of a particular kind
        :return:
        """

    def id_relation(self):
        """
Ejemplo n.º 26
0
class Model:

    modelpath = '/store/ai/model/'

    def __init__(self):

        # Sequence Tagging Model
        tagger_file = self.modelpath + 'tagger.pt'
        if Path(tagger_file).is_file():
            print('loading tagger from file')
            self.tagger = SequenceTagger.load_from_file(tagger_file)
        else:
            print('downloading pretrained tagger')
            self.tagger = SequenceTagger.load('ner-ontonotes')
            self.tagger.save(tagger_file)

        # Text Embedding Model
        embeddings_file = self.modelpath + 'embeddings.pickle'
        if Path(embeddings_file).is_file():
            print('loading embedder from file')
            filestream = open(embeddings_file, 'rb')
            self.embeddings = pickle.load(filestream)
        else:
            print('downloading pretrained embedders')
            self.embeddings = [
                # WordEmbeddings('glove'),
                FlairEmbeddings('multi-forward')
                # FlairEmbeddings('multi-backward')
            ]
            filestream = open(embeddings_file, 'wb')
            pickle.dump(self.embeddings, filestream)

        self.token_embedder = StackedEmbeddings(self.embeddings)
        self.doc_embedder = DocumentPoolEmbeddings(self.embeddings)

    def parse(self, text):

        sentence = Sentence(text)
        self.tagger.predict(sentence)
        self.token_embedder.embed(sentence)
        self.doc_embedder.embed(sentence)

        return sentence

    def mindmap(self, text):

        # parsing
        lines, arrows = parse_veclang(text)
        sentences = [self.parse(line) for line in lines]
        tensors = [s.get_embedding() for s in sentences]

        # tensor processing
        norm_tensors = normalize_tensors(tensors)
        flat_tensors = PCA(norm_tensors)

        # plot map
        filename = plot_embeddings(lines, flat_tensors, arrows)

        return f'Plotted mindmap to {filename}'

    def similarity(self, text):
        lines = text.split('//')
        sentences = [self.parse(line) for line in lines]
        vecs = [squeeze(s.embedding) for s in sentences]
        sim = dot(vecs[0], vecs[1]) / (norm(vecs[0]) * norm(vecs[1]))

        return f'the similarity is {sim}'
Ejemplo n.º 27
0
class FlairDataSet(Dataset):
    def __init__(self, data_path, encoding="latin1", reuse_emb=True):
        emb_path = os.path.join(os.path.dirname(data_path),
                                "last_computed_dataset.pt")

        getter = SentenceGetter(data_path, encoding)
        tokens = []
        labels = []

        self.labels = [[s[1] for s in sent] for sent in getter.sentences]
        self.tag_vals = list(set([l for labels in self.labels
                                  for l in labels]))
        self.tag2idx = {t: i for i, t in enumerate(self.tag_vals)}
        self.idx2tag = {v: k for k, v in self.tag2idx.items()}
        self.stacked_embeddings = None
        self.init_emb()

        if reuse_emb and os.path.isfile(emb_path):
            self.data = torch.load(emb_path)
            self._len = self.data.__len__()
            return

        for i in range(len(getter.sentences)):
            pre_len, pre = 0, ''
            if i - 1 >= 0:
                pre_len = len(getter.sentences[i - 1])
                pre = ' '.join([s[0] for s in getter.sentences[i - 1]])

            sent_len = len(getter.sentences[i])
            sent = ' '.join([s[0] for s in getter.sentences[i]])

            next_len, next_s = 0, ''
            if i + 1 < len(getter.sentences):
                next_len = len(getter.sentences[i + 1])
                next_s = ' '.join([s[0] for s in getter.sentences[i + 1]])

            tokens += self.embed_sent(pre, pre_len, sent, sent_len, next_s,
                                      next_len)
            #labels += [s[1] for s in getter.sentences[i]]

        #self.tags = torch.Tensor([self.tag2idx.get(l) for l in labels])
        self.tags = torch.Tensor([[self.tag2idx.get(l) for l in lab]
                                  for lab in self.labels]).flatten()
        self.tokens = torch.cat(tokens)

        self.data = TensorDataset(self.tokens, self.tags)

        torch.save(self.data, emb_path)
        print('token size', self.tokens.size(), 'tags size', self.tags.size())
        print('Saved embeddings to ' + emb_path)

        self._len = len(self.labels)  # to check

    def __getitem__(self, index: int):
        return self.data[index]

    def __len__(self) -> int:
        return self._len

    def init_emb(self):
        # init standard GloVe embedding
        flair.device = torch.device("cpu")
        glove_embedding = WordEmbeddings('glove')

        # init Flair forward and backwards embeddings
        flair_embedding_forward = FlairEmbeddings('news-forward')
        flair_embedding_backward = FlairEmbeddings('news-backward')
        # create a StackedEmbedding object that combines glove and forward/backward flair embeddings
        self.stacked_embeddings = StackedEmbeddings([
            glove_embedding,
            flair_embedding_forward,
            flair_embedding_backward,
        ])

    def embed_sent(self, pre, pre_len, sent, sent_len, next_s, next_len):
        s = Sentence(' '.join([pre, sent, next_s]))
        self.stacked_embeddings.embed(s)
        assert len(s.tokens) == (pre_len + sent_len + next_len)
        return [
            tok.embedding.view(1, -1)
            for tok in s.tokens[pre_len:(pre_len + sent_len)]
        ]
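
# --- Usage sketch (not part of the original snippet) ------------------------
# Hypothetical driver for FlairDataSet; the CSV path is a placeholder and the
# downstream classifier is left out.
from torch.utils.data import DataLoader

train_set = FlairDataSet("data/ner_train.csv", encoding="latin1")
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
for token_embedding, tag_idx in train_loader:
    pass  # feed the precomputed Flair embeddings and tag indices to a tagger
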
Ejemplo n.º 28
0
class Encoder(object):
    def __init__(self, corpus, emb_path, flair=False):

        self.word2index, self.word_emb = self.get_pretrain_embeddings(emb_path, corpus.get_word_vocab())
        self.index2word = {i: w for w, i in self.word2index.items()}
        self.flair_words = None

        if config.if_flair or flair:
            # self.elmo = ELMoEmbeddings()
            # self.bert_embedding = BertEmbeddings('bert-base-cased')
            self.flair_forward_embedding = FlairEmbeddings('news-forward')
            self.flair_backward_embedding = FlairEmbeddings('news-backward')
            self.stacked_embeddings = StackedEmbeddings(
                embeddings=[self.flair_forward_embedding, self.flair_backward_embedding])

    def flair_encode(self, data):
        """Generate list of flair embeddings for each sentence in data"""
        sentences = [Sentence(' '.join(words)) for words in data]
        _ = [self.stacked_embeddings.embed(sentence) for sentence in tqdm(sentences)]
        corpus_embeddings = []
        for item in sentences:
            emb_seq = [token.embedding for token in item]
            corpus_embeddings.append(emb_seq)
        return corpus_embeddings

    def encode_words(self, corpus, flair=False):
        if not flair:
            corpus.train.words = [self.encode(self.word2index, sample) for sample in corpus.train.words]
            corpus.dev.words = [self.encode(self.word2index, sample) for sample in corpus.dev.words]
            corpus.test.words = [self.encode(self.word2index, sample) for sample in corpus.test.words]
        else:
            corpus.dev.embeddings = self.flair_encode(corpus.dev.words)
            corpus.train.embeddings = self.flair_encode(corpus.train.words)
            corpus.test.embeddings = self.flair_encode(corpus.test.words)
            return corpus

    def decode_words(self, corpus):
        corpus.train.words = [self.encode(self.index2word, sample) for sample in corpus.train.words]
        corpus.dev.words = [self.encode(self.index2word, sample) for sample in corpus.dev.words]
        corpus.test.words = [self.encode(self.index2word, sample) for sample in corpus.test.words]

    def encode(self, elem2index, elems):
        return [elem2index[elem] for elem in elems]

    @staticmethod
    def get_encoder(corpus, emb_path, encoder_pkl_path):
        if os.path.exists(encoder_pkl_path):
            encoder = Encoder.load(encoder_pkl_path)
        else:
            encoder = Encoder(corpus, emb_path)
            encoder.save(encoder_pkl_path)

        Encoder.print_stats(encoder)

        return encoder

    def print_stats(self):
        print('[LOG]')
        print("[LOG] Word vocab size: {}".format(len(self.word2index)))


    def save(self, filename):
        pickle.dump(self, open(filename, 'wb'))

    @staticmethod
    def load(filename):
        with open(filename, 'rb') as fp:
            return pickle.load(fp)



    def get_pretrain_embeddings(self, filename, vocab):
        assert len(vocab) == len(set(vocab)), "The vocabulary contains repeated words"

        w2i, emb = read_text_embeddings(filename)
        word2index = {'+pad+': 0, '+unk+': 1}
        embeddings = np.zeros((len(vocab) + 2, emb.shape[1]))

        scale = np.sqrt(3.0 / emb.shape[1])
        embeddings[word2index['+unk+']] = np.random.uniform(-scale, scale, (1, emb.shape[1]))

        perfect_match = 0
        case_match = 0
        no_match = 0

        for i in range(len(vocab)):
            word = vocab[i]
            index = len(word2index)  # do not use i because word2index has predefined tokens

            word2index[word] = index
            if word in w2i:
                embeddings[index] = emb[w2i[word]]
                perfect_match += 1
            elif word.lower() in w2i:
                embeddings[index] = emb[w2i[word.lower()]]
                case_match += 1
            else:
                embeddings[index] = np.random.uniform(-scale, scale, (1, emb.shape[1]))
                no_match += 1
        print("[LOG] Word embedding stats -> Perfect match: {}; Case match: {}; No match: {}".format(perfect_match,
                                                                                                     case_match,
                                                                                                     no_match))
        return word2index, embeddings
Ejemplo n.º 29
0
class FlairConnector:
    def __init__(self,
                 word_embedding_base: str = None,
                 document_embedding: str = None,
                 fine_tune: bool = False,
                 pretuned: bool = False):
        """

        :param word_embedding_base: glove: 'glove' (en only); fasttext: 'en', 'de'
        :param document_embedding: 'pool' or 'rnn' when combined with a word embedding base; otherwise e.g. 'bert', 'bert-de',
        'longformer' (en only), 'xlnet', 'xlnet-de', 'flair', 'flair-de', 'stack-flair', 'stack-flair-de'
        """
        # document embedding
        self.fine_tune = fine_tune
        self.document_embedding = None
        if word_embedding_base:
            self.word_embedding_base = WordEmbeddings(word_embedding_base)

            if document_embedding.lower() == 'pool':
                self.document_embedding = DocumentPoolEmbeddings(
                    [self.word_embedding_base])
            elif document_embedding.lower() == 'rnn':
                self.document_embedding = DocumentRNNEmbeddings(
                    [self.word_embedding_base])
            else:
                raise UserWarning(
                    f'{document_embedding} is not supported for combination with word embeddings'
                )
        elif document_embedding:
            print(document_embedding, pretuned)
            if pretuned:
                if document_embedding.lower(
                ) == 'bert' or document_embedding.lower() == 'bert-de':
                    self.document_embedding = SentenceTransformer(
                        'stsb-bert-large')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-bert-large')
                elif document_embedding.lower() == 'roberta':
                    self.document_embedding = SentenceTransformer(
                        'stsb-roberta-large')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-roberta-large')
                elif document_embedding.lower() == 'xlm':
                    self.document_embedding = SentenceTransformer(
                        'stsb-xlm-r-multilingual')
                    # self.document_embedding = SentenceTransformerDocumentEmbeddings('stsb-xlm-r-multilingual')
            else:
                if document_embedding.lower() == 'bert':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'bert-base-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'bert-de':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'bert-base-german-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'longformer':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'allenai/longformer-base-4096', fine_tune=fine_tune)
                elif document_embedding.lower() == 'xlnet':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'xlnet-base-cased', fine_tune=fine_tune)
                elif document_embedding.lower() == 'xlnet-de':
                    self.document_embedding = TransformerDocumentEmbeddings(
                        'xlm-mlm-ende-1024', fine_tune=fine_tune)
                elif document_embedding.lower() == 'flair':
                    self.document_embedding = FlairEmbeddings(
                        'en-forward', fine_tune=fine_tune)
                elif document_embedding.lower() == 'flair-de':
                    self.document_embedding = FlairEmbeddings(
                        'de-forward', fine_tune=fine_tune)
                elif document_embedding.lower() == 'stack-flair':
                    self.document_embedding = StackedEmbeddings([
                        FlairEmbeddings('en-forward'),
                        FlairEmbeddings('en-backward'),
                    ])
                elif document_embedding.lower() == 'stack-flair-de':
                    self.document_embedding = StackedEmbeddings([
                        FlairEmbeddings('de-forward'),
                        FlairEmbeddings('de-backward'),
                    ])
        else:
            raise UserWarning('No embeddings defined')

    def fine_tune(self):
        if isinstance(self.document_embedding, TransformerDocumentEmbeddings):
            corpus = TREC_6()
            label_dict = corpus.make_label_dictionary()
            classifier = TextClassifier(self.document_embedding,
                                        label_dictionary=label_dict)
            trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

            # 6. start the training
            trainer.train(
                'resources/taggers/trec',
                learning_rate=3e-5,  # use very small learning rate
                mini_batch_size=16,
                mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
                max_epochs=5,  # terminate after 5 epochs
            )
        else:
            raise UserWarning(
                "No fine tuning for this embedding type implemented")

    def ft(self):
        if isinstance(self.document_embedding, LanguageModel):
            trainer = LanguageModelTrainer(self.document_embedding, corpus)
            trainer.train('resources/taggers/language_model',
                          sequence_length=100,
                          mini_batch_size=100,
                          learning_rate=20,
                          patience=10,
                          checkpoint=True)

    def embedd_document(self, document: str) -> Tensor:
        flair_doc = Sentence(document)

        self.document_embedding.embed(flair_doc)
        return flair_doc.get_embedding().detach().numpy()

    def embedd_document_p(self, document: str,
                          doc_id: str) -> Tuple[Tensor, str]:
        flair_doc = Sentence(document)

        self.document_embedding.embed(flair_doc)
        return flair_doc.get_embedding().detach().numpy(), doc_id

    def embedd_documents(self, documents: Union[FlairDocumentIterator,
                                                FlairFacetIterator]):
        parallel = False
        doc_bar = tqdm(documents,
                       total=len(documents),
                       desc="Flair Embedding",
                       disable=True)
        if parallel:
            num_cores = int(0.75 * multiprocessing.cpu_count())
            print(f"parralized on {num_cores} cores")
            result_tuples = Parallel(n_jobs=num_cores)(
                delayed(self.embedd_document_p)(document, doc_id)
                for doc_id, document in doc_bar)
            return {doc_id: doc_vec for (doc_vec, doc_id) in result_tuples}
        else:
            if isinstance(self.document_embedding, SentenceTransformer):
                # Materialize the iterator once; it cannot safely be consumed twice.
                pairs = list(doc_bar)
                doc_ids = [doc_id for doc_id, _ in pairs]
                texts = [document for _, document in pairs]
                embeddings = self.document_embedding.encode(
                    texts, batch_size=10, show_progress_bar=True)
                return {
                    doc_id: embedding
                    for doc_id, embedding in zip(doc_ids, embeddings)
                }
            else:
                # sentences = [Sentence(document) for doc_id, document in doc_bar]
                # self.document_embedding.embed(sentences)
                # doc_ids = (doc_id for doc_id, document in doc_bar)
                # return {doc_id: sentence.get_embedding().detach().numpy()
                #         for sentence, doc_id in zip(sentences, doc_ids)}
                # print(len(self.document_embedding.embeddings))
                return {
                    doc_id: self.embedd_document(document)
                    for doc_id, document in doc_bar
                }
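For reference, a standalone sketch of the document-embedding call wrapped by embedd_document() above; the model name and example text are placeholders, and only standard flair API calls are assumed:

from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings

# Build a document-level embedding and embed a single document,
# mirroring what embedd_document() does above.
document_embedding = TransformerDocumentEmbeddings('xlnet-base-cased', fine_tune=False)
sentence = Sentence("A short example document about word embeddings.")
document_embedding.embed(sentence)
vector = sentence.get_embedding().detach().cpu().numpy()
print(vector.shape)  # dimensionality of the document vector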
Example No. 30
0
class WSDModel(nn.Module):
    def __init__(self,
                 lang,
                 embeddings_dim,
                 embedding_weights,
                 hidden_dim,
                 hidden_layers,
                 dropout,
                 output_layers=["embed_wsd"],
                 lemma2synsets=None,
                 synsets2id={},
                 pos_tags={},
                 entity_tags={},
                 use_flair=False,
                 combine_WN_FN=False):
        super(WSDModel, self).__init__()
        self.use_flair = use_flair
        self.combine_WN_FN = combine_WN_FN
        self.output_layers = output_layers
        self.hidden_layers = hidden_layers
        self.hidden_dim = hidden_dim
        self.num_wsd_classes = 0
        self.synsets2id = synsets2id
        output_emb_dim = embeddings_dim
        if use_flair is True:
            if lang == "Bulgarian":
                # BG EMBEDDINGS:
                self.word_embeddings = StackedEmbeddings([
                    WordEmbeddings(
                        '/home/lenovo/dev/PostDoc/LREC/Embeddings/cc.bg.300.vec_FILTERED_OOV.gensim'
                    ),
                    # WordEmbeddings('bg'),
                    # FastTextEmbeddings('/home/lenovo/dev/PostDoc/LREC/Embeddings/cc.bg.300.vec_FILTERED_OOV'),
                    # Byte pair embeddings for Bulgarian
                    BytePairEmbeddings('bg'),
                    FlairEmbeddings('bg-forward-fast'),
                    FlairEmbeddings('bg-backward-fast'),
                    CharacterEmbeddings()
                ])
            elif lang == "English":
                # EN EMBEDDINGS:
                self.word_embeddings = StackedEmbeddings([
                    WordEmbeddings(
                        '/home/lenovo/dev/word-embeddings/glove.6B/glove.6B.300d_MOD.gensim'
                    ),
                    WordEmbeddings(
                        '/home/lenovo/dev/word-embeddings/lemma_sense_embeddings/'
                        'WN30WN30glConOne-C15I7S7N5_200M_syn_and_lemma_WikipediaLemmatized_FILTERED.gensim'
                    ),
                    # WordEmbeddings('bg'),
                    # FastTextEmbeddings('/home/lenovo/dev/PostDoc/LREC/Embeddings/cc.bg.300.vec_FILTERED_OOV'),
                    # Byte pair embeddings for English
                    BytePairEmbeddings('en'),
                    FlairEmbeddings('en-forward-fast'),
                    FlairEmbeddings('en-backward-fast'),
                    CharacterEmbeddings()
                ])
            else:
                print("Unknown language!")
                exit(1)
            embeddings_dim = self.word_embeddings.embedding_length
        else:
            self.word_embeddings = nn.Embedding.from_pretrained(
                embedding_weights, freeze=True)
        self.lstm = nn.LSTM(embeddings_dim,
                            hidden_dim,
                            hidden_layers,
                            bidirectional=True,
                            batch_first=True,
                            dropout=dropout)
        if "embed_wsd" in self.output_layers:
            # We want output with the size of the lemma&synset embeddings
            self.emb_relu = nn.ReLU()
            self.output_emb = nn.Linear(2 * hidden_dim, output_emb_dim)
        if "embed_frameID" in self.output_layers:
            self.emb_relu_frames = nn.ReLU()
            self.output_emb_frames = nn.Linear(2 * hidden_dim, output_emb_dim)
        if "classify_wsd" in self.output_layers:
            if len(self.synsets2id) > 0:
                self.output_classify = nn.Linear(2 * hidden_dim,
                                                 len(self.synsets2id))
                self.num_wsd_classes = len(self.synsets2id)
            else:
                lemma2layers = collections.OrderedDict()
                for lemma, synsets in lemma2synsets.items():
                    lemma2layers[lemma] = nn.Linear(2 * hidden_dim,
                                                    len(synsets))
                    if len(synsets) > self.num_wsd_classes:
                        self.num_wsd_classes = len(synsets)
                self.classifiers = nn.Sequential(lemma2layers)
        if "pos_tagger" in self.output_layers:
            self.pos_tags = nn.Linear(2 * hidden_dim, len(pos_tags))
        if "ner" in self.output_layers:
            self.ner = nn.Linear(2 * hidden_dim, len(entity_tags))
        self.dropout = nn.Dropout(dropout)

    def forward(self, data, lemmas, source_ids=None):
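        # Expected keys in `data` (as used below):
        #   "batch_layers": per-example list of active output layers,
        #   "sentence":     raw sentence strings (used when self.use_flair is True),
        #   "inputs":       token-id tensor (used otherwise),
        #   "length":       true sequence lengths for packing,
        #   "mask":         boolean mask selecting the sense-annotated tokens.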
        batch_layers = [layers[0] for layers in data['batch_layers']]
        if self.use_flair is True:
            X = data["sentence"]
            X = [Sentence(sent) for sent in X]
            self.word_embeddings.embed(X)
            X = [
                torch.stack([token.embedding for token in sentence])
                for sentence in X
            ]
            # pad_vector = torch.zeros(self.word_embeddings.embedding_length)
            X = pad_sequence(X, batch_first=True, padding_value=0.0)
        else:
            X = data["inputs"]
            X = self.word_embeddings(X)  # shape is [batch_size, max_length, embeddings_dim]
        X_lengths, mask = data["length"], data["mask"]
        X = self.dropout(X)
        X = torch.nn.utils.rnn.pack_padded_sequence(X,
                                                    X_lengths,
                                                    batch_first=True,
                                                    enforce_sorted=False)
        X, _ = self.lstm(X)
        # pad_packed_sequence cuts the sequences in the batch to the greatest sequence length
        X, _ = torch.nn.utils.rnn.pad_packed_sequence(
            X, batch_first=True
        )  # shape is [batch_size, max_length_of_X, 2* hidden_layer]
        # Therefore, make sure mask has the same shape as X
        mask = mask[:, :X.shape[1]]  # shape is [batch_size, max_length_of_X]
        mask = torch.reshape(mask, (mask.shape[0], mask.shape[1], 1))
        X_wsd = torch.masked_select(X, mask)
        X_wsd = X_wsd.view(
            -1, 2 * self.hidden_dim)  # shape is [num_labels, 2*hidden_dim]
        outputs = {}
        for layer in self.output_layers:
            if layer == "embed_wsd" and layer in batch_layers:
                outputs["embed_wsd"] = self.dropout(
                    self.output_emb(self.emb_relu(X_wsd)))
            if layer == "embed_frameID" and layer in batch_layers:
                outputs["embed_frameID"] = self.dropout(
                    self.output_emb_frames(self.emb_relu_frames(X_wsd)))
            if layer == "classify_wsd" and layer in batch_layers:
                if len(self.synsets2id) > 0:
                    outputs["classify_wsd"] = self.dropout(
                        self.output_classify(X_wsd))
                else:
                    outputs_classif = []
                    for i, x in enumerate(torch.unbind(X_wsd)):
                        # lemma_pos = lemmas[i] + "-" + POS_MAP[pos[i]]
                        output_classif = self.dropout(
                            self.classifiers._modules[lemmas[i]](x))
                        outputs_classif.append(output_classif)
                    outputs_classif = pad_sequence(outputs_classif,
                                                   batch_first=True,
                                                   padding_value=-100)
                    outputs["classify_wsd"] = outputs_classif
            if layer == "pos_tagger" and layer in batch_layers:
                outputs["pos_tagger"] = pad_sequence(self.dropout(
                    self.pos_tags(X)),
                                                     batch_first=True,
                                                     padding_value=-100)
            if layer == "ner" and layer in batch_layers:
                outputs["ner"] = pad_sequence(self.dropout(self.ner(X)),
                                              batch_first=True,
                                              padding_value=-100)
        return outputs

    def forward_old(self, X, X_lengths, mask, pos_mask, lemmas):
        X = self.word_embeddings(X)  # shape is [batch_size, max_length, embeddings_dim]
        X = torch.nn.utils.rnn.pack_padded_sequence(X,
                                                    X_lengths,
                                                    batch_first=True,
                                                    enforce_sorted=False)
        X, _ = self.lstm(X)
        # pad_packed_sequence cuts the sequences in the batch to the greatest sequence length
        X, _ = torch.nn.utils.rnn.pad_packed_sequence(
            X, batch_first=True
        )  # shape is [batch_size, max_length_of_X, 2* hidden_layer]
        # Therefore, make sure mask has the same shape as X
        mask = mask[:, :X.shape[1]]  # shape is [batch_size, max_length_of_X]
        pos_mask = pos_mask[:, :X.shape[1]]  # shape is [batch_size, max_length_of_X]
        # Make mask broadcastable to X
        mask = torch.reshape(mask, (mask.shape[0], mask.shape[1], 1))
        pos_mask = torch.reshape(pos_mask,
                                 (pos_mask.shape[0], pos_mask.shape[1], 1))
        # Select only RNN outputs that correspond to synset-tagged words in the data
        X_wsd = torch.masked_select(X, mask)
        # masked_select flattens the tensor, but we need it as matrix
        X_wsd = X_wsd.view(
            -1, 2 * self.hidden_dim)  # shape is [num_labels, 2*hidden_dim]
        # Select also the words to be POS tagged
        X_pos = torch.masked_select(X, pos_mask)
        X_pos = X_pos.view(-1, 2 * self.hidden_dim)
        outputs = {}
        for layer in self.output_layers:
            if layer == "embed_wsd":
                outputs["embed_wsd"] = self.dropout(self.output_emb(X_wsd))
            elif layer == "classify_wsd":
                if len(self.synsets2id) > 0:
                    outputs["classify_wsd"] = self.dropout(
                        self.output_classify(X_wsd))
                else:
                    outputs_classif = []
                    for i, x in enumerate(torch.unbind(X_wsd)):
                        # lemma_pos = lemmas[i] + "-" + POS_MAP[pos[i]]
                        output_classif = self.dropout(
                            self.classifiers._modules[lemmas[i]](x))
                        outputs_classif.append(output_classif)
                    outputs_classif = pad_sequence(outputs_classif,
                                                   batch_first=True,
                                                   padding_value=-100)
                    outputs["classify_wsd"] = outputs_classif
            elif layer == "pos_tagger":
                outputs["pos_tagger"] = self.dropout(self.pos_tags(X_pos))
        return outputs
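For reference, a minimal usage sketch of WSDModel with use_flair=False and a tiny random embedding table, so it does not depend on the hard-coded local embedding paths above; all sizes, token ids, and the lemma are illustrative placeholders:

import torch

# Illustrative instantiation: only the "embed_wsd" head is active.
vocab_size, emb_dim = 10, 300
embedding_weights = torch.randn(vocab_size, emb_dim)
model = WSDModel(lang="English",
                 embeddings_dim=emb_dim,
                 embedding_weights=embedding_weights,
                 hidden_dim=128,
                 hidden_layers=2,
                 dropout=0.2,
                 output_layers=["embed_wsd"],
                 use_flair=False)

batch = {
    "inputs": torch.tensor([[1, 2, 3, 4, 5, 6]]),  # token ids, shape [batch, max_len]
    "batch_layers": [["embed_wsd"]],
    "length": torch.tensor([6]),
    # True marks the single sense-annotated token.
    "mask": torch.tensor([[False, True, False, False, False, False]]),
}
outputs = model(batch, lemmas=["bank"])  # lemmas only matter for the classify_wsd head
print(outputs["embed_wsd"].shape)  # [num_annotated_tokens, embeddings_dim]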