def __init__(self,
                 folder='data/mmimdb-256/dataset-resized-256max',
                 split='dev',
                 image_transform=None):
        self.json_dir = os.path.join(folder, split, 'metadata')
        self.image_dir = os.path.join(folder, split, 'images')
        self.image_transform = image_transform
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Instantiate a model to extract text from the poster images.
        self.text_extractor = TextExtractor(
            folder + "/" + split + "/images/",
            split + "_" + "dataset_text_extract_output.txt", split)

        # Movie genre categories.
        self.categories = [
            'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
            'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
            'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
            'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
            'Thriller', 'War', 'Western'
        ]
        self.categories2ids = {
            category: id
            for (id, category) in enumerate(self.categories)
        }

        # Load JSON files.
        print('Loading metadata from %s and extracting text ...' % self.json_dir)
        self.fdir = os.listdir(self.json_dir)
        self.metadata = [(fname[:-5],
                          json.load(open(os.path.join(self.json_dir, fname))))
                         for fname in sorted(self.fdir)
                         if not fname.startswith('.')]
        print('%d metadata entries loaded' % len(self.metadata))
        self.text_extractor.extract_text()
        print('finished')

        # Pre-tokenize all extracted texts with the BERT tokenizer.
        print('Tokenizing...', end='')
        self.tokenized_plots = list()
        for i in range(len(self.metadata)):
            # Tokenize the extracted text; the plot summary alternative would be
            # self.metadata[i][1]['plot'][0].
            text = self.text_extractor.get_item(i)
            encoded_text = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                truncation=True,
                max_length=256,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')
            self.tokenized_plots.append(encoded_text)
        print(' finished')
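    # The rest of the Dataset interface is not shown above. A minimal sketch of
    # __len__/__getitem__ that would pair with this __init__, assuming each metadata
    # JSON carries a 'genres' list, posters are stored as '<imdb_id>.jpeg', and
    # torch / PIL.Image are imported at module level (all assumptions, not taken
    # from the excerpt):
    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        movie_id, record = self.metadata[index]

        # Multi-hot genre vector built from the category vocabulary above.
        labels = torch.zeros(len(self.categories))
        for genre in record.get('genres', []):  # 'genres' field name is assumed
            if genre in self.categories2ids:
                labels[self.categories2ids[genre]] = 1.0

        # Poster image; the '<id>.jpeg' naming is an assumption about this dataset layout.
        image = Image.open(os.path.join(self.image_dir, movie_id + '.jpeg')).convert('RGB')
        if self.image_transform is not None:
            image = self.image_transform(image)

        encoded = self.tokenized_plots[index]
        return (encoded['input_ids'].squeeze(0),
                encoded['attention_mask'].squeeze(0),
                image, labels)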
def __init__(self, domain, api_user, api_token, blacklist_file,
             max_attachment_size, cache_location,
             start_date: datetime.date):
    self._cache_location = cache_location
    self._start_date = start_date
    self._domain = domain
    self._text_extractor = TextExtractor()
    self._repository = ConfluenceRepository(
        domain, api_user, api_token, max_attachment_size,
        self._text_extractor.supported_mime_types)
    self._secret_finder = SecretFinder(blacklist_file)
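# The class around the __init__ above is not shown; assuming it is importable as
# ConfluenceSecretScanner (placeholder name), wiring it up might look like this
# (all argument values are illustrative):
import datetime

scanner = ConfluenceSecretScanner(       # placeholder class name, not from the excerpt
    domain='example.atlassian.net',
    api_user='bot@example.com',
    api_token='<api-token>',
    blacklist_file='blacklist.txt',
    max_attachment_size=10 * 1024 * 1024,  # 10 MiB cap on downloaded attachments
    cache_location='/tmp/confluence-cache',
    start_date=datetime.date(2024, 1, 1),
)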
Example #3
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import json
from text_extractor import TextExtractor
from document_analyzer import DocumentAnalyzer
from document_indexer import DocumentIndexer

document_indexer = DocumentIndexer()
document_analyzer = DocumentAnalyzer()
text_extractor = TextExtractor()


def handler(event, context):
    message = json.loads(event['Records'][0]['Sns']['Message'])

    jobId = message['JobId']
    print("JobId=" + jobId)

    status = message['Status']
    print("Status=" + status)

    if status != "SUCCEEDED":
        # TODO : handle error with Dead letter queue (not in this workshop)
        # https://docs.aws.amazon.com/lambda/latest/dg/dlq.html
        return {  # assumed minimal error payload (snippet is truncated in the source)
            'JobId': jobId,
            'Status': status
        }
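

# A quick local exercise of the handler with a hand-built SNS event; the message
# carries the two Textract notification fields the handler reads (values are
# illustrative).
if __name__ == '__main__':
    fake_message = json.dumps({'JobId': 'job-123', 'Status': 'SUCCEEDED'})
    fake_event = {'Records': [{'Sns': {'Message': fake_message}}]}
    handler(fake_event, None)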
def setUp(self):
    self.text_extractor = TextExtractor(
        "https://scraping-for-beginner.herokuapp.com/login_page")
    self.text_extractor.login("imanishi", "kohei")
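# The TextExtractor here drives a browser login before scraping. Its internals are
# not shown; a rough stand-in for the login step with Selenium (element ids are
# assumptions about the tutorial page, not taken from the excerpt) could look like:
from selenium import webdriver
from selenium.webdriver.common.by import By

def selenium_login(url, username, password):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.find_element(By.ID, 'username').send_keys(username)   # assumed field id
    driver.find_element(By.ID, 'password').send_keys(password)   # assumed field id
    driver.find_element(By.ID, 'login-btn').click()              # assumed button id
    return driver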
Example #5
from text_extractor import TextExtractor
from and_other_pattern_matcher import AndOtherPatternMatcher
from such_as_pattern_matcher import SuchAsPatternMatcher
from or_other_pattern_matcher import OrOtherPatternMatcher
from including_pattern_matcher import IncludingPatternMatcher
from especially_pattern_matcher import EspeciallyPatternMatcher
from text_extractor_pipe import TextExtractorPipe
from knowledge_graph import KnowledgeGraph
from matcher_pipe import MatcherPipe
import spacy

textExtractor1 = TextExtractor("WWII", "Q362")
textExtractor1.extract()
textExtractor2 = TextExtractor("London", "Q84")
textExtractor2.extract()
textExtractor3 = TextExtractor("Paris", "Q90")
textExtractor3.extract()
textExtractor4 = TextExtractor("World War I", "Q361")
textExtractor4.extract()
textExtractorPipe = TextExtractorPipe()
textExtractorPipe.addTextExtractor(textExtractor1)
textExtractorPipe.addTextExtractor(textExtractor2)
textExtractorPipe.addTextExtractor(textExtractor3)
textExtractorPipe.addTextExtractor(textExtractor4)

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')  # spaCy v3 API; spaCy v2 used nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(textExtractorPipe.extract())

andOtherPatternMatcher = AndOtherPatternMatcher(nlp)
suchAsMatcher = SuchAsPatternMatcher(nlp)
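
# The *PatternMatcher classes above implement Hearst-style hyponym patterns
# ("X such as Y", "X and other Y", ...). A rough standalone sketch of the
# "such as" case with spaCy's rule-based Matcher (not the project's actual
# SuchAsPatternMatcher):
from spacy.matcher import Matcher

def find_such_as_spans(nlp, text):
    matcher = Matcher(nlp.vocab)
    # Match "<NOUN> such as <NOUN/PROPN>" token sequences.
    matcher.add('SUCH_AS', [[
        {'POS': 'NOUN'},
        {'LOWER': 'such'},
        {'LOWER': 'as'},
        {'POS': {'IN': ['NOUN', 'PROPN']}},
    ]])
    doc = nlp(text)
    return [doc[start:end].text for _, start, end in matcher(doc)]

# e.g. find_such_as_spans(nlp, 'cities such as London and Paris') -> ['cities such as London']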
import spacy
from sty import fg, bg, ef, rs
from question_processor import QuestionProcessor
from text_extractor import TextExtractor
from text_extractor_pipe import TextExtractorPipe
from context_retriever import ContextRetriever
from answer_retriever import AnswerRetriever
from find_keywords import FindKeywords

# STEP 1: Extract keywords from the question
print(fg.green + "Please enter your question here: " + fg.rs)
question = input()
getKeywords = FindKeywords(question)
key_word = getKeywords.distill()

# STEP 2: Download text from wikipedia
textExtractor = TextExtractor(key_word, "1")
textExtractor.extract()
textExtractorPipe = TextExtractorPipe()
textExtractorPipe.addTextExtractor(textExtractor)

# STEP 3: Retrieve corpus from the text.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')
doc = nlp(textExtractorPipe.extract())
sentences = [sent.text.strip() for sent in doc.sents]
questionProcessor = QuestionProcessor(nlp)
contextRetriever = ContextRetriever(nlp, 3)
questionContext = contextRetriever.getContext(sentences, questionProcessor.process(question))

# STEP 4: Retrieve answer from the corpus.
answerRetriever = AnswerRetriever()