Example #1
    def test_allocate(self):
        # Small data
        context = "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child."
        context_toks = tokenize(context)
        anchors = align(context, context_toks)
        query = ['Houston', ',', 'Texas']
        start_char = 19
        end_char = 32
        span = Span.allocate(anchors, start_char, end_char)
        self.assertEqual(span.start, 4)
        self.assertEqual(span.end, 6)
        for k in range(span.start, span.end + 1):
            self.assertEqual(context_toks[k], query[k - span.start])

        # Real data
        context = "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"."
        context_toks = tokenize(context)
        anchors = align(context, context_toks)
        query = ['Dangerously', 'in', 'Love']
        start_char = 505
        end_char = 523
        span = Span.allocate(anchors, start_char, end_char)
        self.assertEqual(span.start, 108)
        self.assertEqual(span.end, 110)
        for k in range(span.start, span.end + 1):
            self.assertEqual(context_toks[k], query[k - span.start])
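
The assertions above pin down the contract of `align` and `Span.allocate`: anchors map each token back to a character offset in the raw context, and `allocate` converts a character range into a token range. A minimal sketch consistent with these tests, assuming anchors hold each token's starting character offset (the project's real implementations may handle whitespace and sub-token boundaries differently):

# Hypothetical sketch, not the project's actual code.
def align(context, context_toks):
    """Return the starting character offset of every token in `context`."""
    anchors, cursor = [], 0
    for tok in context_toks:
        cursor = context.find(tok, cursor)
        anchors.append(cursor)
        cursor += len(tok)
    return anchors

class Span:
    def __init__(self, start, end):
        self.start, self.end = start, end

    @classmethod
    def allocate(cls, anchors, start_char, end_char):
        # Pick the last token starting at or before each character boundary.
        start = max(i for i, a in enumerate(anchors) if a <= start_char)
        end = max(i for i, a in enumerate(anchors) if a <= end_char)
        return cls(start, end)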
Example #2
    def test_parse_json(self):
        train_json = json.load(open(SQUAD_JSON_FILE, 'rt'))
        para_js = train_json['data'][0]['paragraphs'][0]
        context = para_js['context']
        context_toks = tokenize(context)
        anchors = align(context, context_toks)
        answer_json = para_js['qas'][0]['answers']
        answers = Answer.parse_json(answer_json, context, context_toks,
                                    anchors)
        self.assertEqual(answers[0].span.start, 56)
        self.assertEqual(answers[0].span.end, 59)
        self.assertEqual(answers[0].answer_toks,
                         ['in', 'the', 'late', '1990s'])

        para_js = train_json['data'][0]['paragraphs'][3]
        context = para_js['context']
        context_toks = tokenize(context)
        anchors = align(context, context_toks)
        answer_json = para_js['qas'][8]['answers']
        answers = Answer.parse_json(answer_json, context, context_toks,
                                    anchors)
        # self.assertEqual(answers[0].span)

        self.assertIsNotNone(answers[0].span)
        print(answers[0].span)
Example #3
 def test_tokenize(self):
     self.assertEqual(
         tokenize(
             "What is the title of his first commercially successful work?"
         ), [
             'What', 'is', 'the', 'title', 'of', 'his', 'first',
             'commercially', 'successful', 'work', '?'
         ])
     self.assertEqual(tokenize("Rondo Op. 1."),
                      ['Rondo', 'Op', '.', '1', '.'])
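
Both assertions can be satisfied by a simple regex tokenizer that splits text into word runs and individual punctuation marks. A minimal sketch under that assumption (the project's `tokenize` may be more elaborate, e.g. handling abbreviations or unicode differently):

import re

def tokenize(text):
    # Word-character runs, or any single non-space, non-word character.
    return re.findall(r"\w+|[^\w\s]", text)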
Example #4
    def ask(self, context, query_text):
        vocab = self.data.vocab

        # Parse context
        raw_context = context
        context_toks = tokenize_long_text(context)
        context_toks = [t.strip(' ') for t in context_toks]
        context_chars = to_chars(context_toks, cf.WORD_LEN, cf.PAD_CHAR)
        contextw = vocab.vectorize(context_toks, cf.CONTEXT_LEN)
        contextc = vocab.vectorize_c(context_chars, cf.CONTEXT_LEN,
                                     cf.WORD_LEN)

        # Parse query
        q_toks = tokenize(query_text)
        queryw = vocab.vectorize(q_toks, cf.QUERY_LEN)
        question_chars = to_chars(q_toks, cf.WORD_LEN, cf.PAD_CHAR)
        queryc = vocab.vectorize_c(question_chars, cf.QUERY_LEN, cf.WORD_LEN)

        # Build input
        X_batch = [[np.array(contextw)], [np.array(queryw)],
                   [np.array(contextc)], [np.array(queryc)]]

        # Predict
        p1, p2, starts, ends = self.keras_model.predict_on_batch(X_batch)
        start = int(np.squeeze(starts, axis=-1)[0])
        end = int(np.squeeze(ends, axis=-1)[0])
        answer = [context_toks[i] for i in range(start, end + 1)]
        return answer
Example #5
def main():

	translation_table = get_word_translations("100kword_trans.csv")
	translator = DirectTrans(translation_table)

	english = tokenize("data/100ktok.low.en")
	spanish = tokenize("data/100ktok.low.es")

	training_set, test_set, translated_set = get_datasets(english, spanish)

	test_output = open('trans_direct.txt','w')

	for i in range(len(test_set)):
		test_output.write(' '.join(translator.translate(test_set[i])) + "\n")

	test_output.close()
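
`DirectTrans` itself is not shown here; a plausible sketch consistent with the usage above, assuming `translation_table` maps source words to target words and unknown words are passed through unchanged:

class DirectTrans(object):
    def __init__(self, translation_table):
        self.table = translation_table

    def translate(self, sentence_toks):
        # Word-by-word lookup; keep tokens that have no known translation.
        return [self.table.get(tok, tok) for tok in sentence_toks]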
Example #6
def main():

    english = tokenize("data/100ktok.low.en")
    spanish = tokenize("data/100ktok.low.es")

    training_set, test_set, translated_set = get_datasets(english, spanish)
    translations = get_word_translations("3000_trans.txt")
    search = BeamSearch(training_set, translations)

    test_output = open('trans_beam.txt','w')
    true_output = open('trans_true.txt','w')

    for i in range(len(test_set)):
        print "Translating sentence", i, "..."
        test_output.write(' '.join(search.translate(test_set[i])) + "\n")
        true_output.write(' '.join(translated_set[i]) + "\n")

    test_output.close()
    true_output.close()
Example #7
def get_document_abstract_norm(uri):

    doc = get_current(uri)

    bow = utilities.tokenize(doc['abstract'], max_sent=5)

    doc['abstract_norm'] = ' '.join(bow)
    doc['abstract_token'] = list(set([t for t in bow if len(t) > 5]))[:15]

    return doc
Example #8
def main():

    english = tokenize("data/100ktok.low.en")
    spanish = tokenize("data/100ktok.low.es")

    training_set, test_set, translated_set = get_datasets(english, spanish)
    translations = get_word_translations("3000_trans.txt")

    print "Original Sentence:", ' '.join(test_set[0])

    translator = DirectTrans(translations)
    print "Direct Translation:", ' '.join(translator.translate(test_set[0]))

    test_output = open('trans_beam.txt','w')
    true_output = open('trans_true.txt','w')

    search = BeamSearch(training_set, translations)
    print "Beam Translation:", ' '.join(search.translate(test_set[0]))
    print "True Translation:", ' '.join(translated_set[0])
Example #9
    def create(self, stopword_list=[], stemming_func=None):
        """
            Input Parameters:

            * stopword_list: a list of words to be removed from the collection. ["the","a","an","of"...]
            * stemming_func: a function for stemming.

            Output:
            * the following two files in disk.

            (1) docs.map:

            This file contains one line for every document in the collection, mapping document names to internal ids.
            Each line holds two fields separated by a single comma: the document name and its internal id.

            Example:
            -------

            simple000001.txt,1
            simple001234.txt,2
            simple987654.txt,3
            ...
            ...


            (2) iindex.map:

            This file contains one line for every distinct token found in the text, listing the token's
            occurrences in the files of this collection.

            The output consists of posting lists for the collection that look like this:

            wordA,doc1:freq1,doc2:freq2,doc3:freq3....
            wordB,doc3:freq3,doc5:freq5,doc6:freq6....

        """

        for f in self.files:
            text = open_file(f)
            tokens = tokenize(text, stopword_list, stemming_func)
            # ...
            # ....
            # ....
            pass

        # Save to document mappings to disk:
        with open("docs.map","w") as fout:
            #fout.write(...)
            pass

        # Save iindex to disk:
        with open("iindex.map","w") as fout:
            #fout.write(...)
            pass
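
A hypothetical completion of the skeleton above, writing the two files in the formats described by the docstring; it is written as a free function over `files` (standing in for `self.files`), and only `open_file` and `tokenize` come from the snippet, everything else is an assumption:

from collections import defaultdict

def build_index(files, stopword_list=[], stemming_func=None):
    doc_ids = {}                  # document name -> internal id
    postings = defaultdict(dict)  # token -> {doc_id: frequency}

    for doc_id, f in enumerate(files, start=1):
        doc_ids[f] = doc_id
        for tok in tokenize(open_file(f), stopword_list, stemming_func):
            postings[tok][doc_id] = postings[tok].get(doc_id, 0) + 1

    # docs.map: one "name,id" pair per line.
    with open("docs.map", "w") as fout:
        for name, doc_id in sorted(doc_ids.items(), key=lambda kv: kv[1]):
            fout.write("%s,%d\n" % (name, doc_id))

    # iindex.map: "word,doc1:freq1,doc2:freq2,..." per line.
    with open("iindex.map", "w") as fout:
        for tok in sorted(postings):
            entries = ",".join("%d:%d" % (d, c)
                               for d, c in sorted(postings[tok].items()))
            fout.write("%s,%s\n" % (tok, entries))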
Example #10
def home(request):
    data = {'title': "pdf converter"}
    if request.method == 'GET':
        return render(request, 'pdftotext/home.html', data)
    elif request.method == 'POST':
        data['pdf_to_text'] = ""
        data['is_converted'] = False
        if request.FILES["pdf-file"]:
            pdf = request.FILES["pdf-file"]
            keywords = request.POST.get("key-words")
            pdfText = PdfText(pdf=pdf)
            pdfText.save()
            pdfPath = pdfText.pdf.path
            text = pdf_to_text(pdfPath)
            data['pdf_to_text'] = text
            data['is_converted'] = True

            global IS_CASE_SENSITIVE
            if not IS_CASE_SENSITIVE:
                text = text.lower()
                keywords = keywords.lower()
            
            histogramDict = histogram(tokenize(text))
            keywords = tokenize(keywords)
            keywordCount = dict()
            for keyword in keywords:
                count = get_count(keyword, histogramDict)
                keywordCount[keyword] = count
            data['keywords'] = keywordCount

        return render(request, 'pdftotext/home.html', data)
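
`histogram` and `get_count` are not defined in this view; a minimal sketch of behavior consistent with how they are called above (names and semantics are assumptions):

from collections import Counter

def histogram(tokens):
    # Map each token to the number of times it occurs.
    return Counter(tokens)

def get_count(keyword, histogram_dict):
    # Frequency of `keyword`, or 0 if it never appears in the text.
    return histogram_dict.get(keyword, 0)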
Example #11
    def parse_json(cls, answers_js: List[dict], context: str,
                   context_toks: List[str], anchors: List[int]):
        answers = []
        for ans in answers_js:
            ans_text = ans['text']
            ans_start = ans['answer_start']
            ans_toks = tokenize(ans_text)

            # Identify the span from context, ans_text & start index
            span = Span.allocate(anchors, ans_start,
                                 ans_start + len(ans_text) - 1)
            answers.append(Answer(ans_text, ans_toks, span, ans_start))
        return answers
Example #12
def tokenize_and_vectorize(sentence, vector_dictionary):
    """Return tokens and vector for sentence"""
    tokens = tokenize(sentence)
    pos = tag(tokens)
    length = len(tokens)
    tokens = remove_stop_words(tokens)
    vector = sentence_vector(tokens, vector_dictionary)
    if vector is DO_NOT_INCLUDE:
        return vector
    return {
        'sentence': sentence,
        'tokens': tokens,
        'sentence_vec': vector,
        'pos': pos,
        'length': length
    }
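
`sentence_vector` and the `DO_NOT_INCLUDE` sentinel are defined elsewhere; one plausible sketch consistent with the check above, assuming the sentence vector is the mean of the available word vectors (the project's actual helper may weight or filter differently):

import numpy as np

DO_NOT_INCLUDE = object()  # sentinel for sentences with no usable tokens

def sentence_vector(tokens, vector_dictionary):
    vectors = [vector_dictionary[t] for t in tokens if t in vector_dictionary]
    if not vectors:
        return DO_NOT_INCLUDE
    return np.mean(vectors, axis=0)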
Example #13
    def parse_json(cls, para_js: dict, para_idx: int):
        # Accumulate all answers' tokens first
        all_para_answers = []
        for q in para_js['qas']:
            if 'answers' in q:
                all_para_answers.extend(q['answers'])
            if 'plausible_answers' in q:
                all_para_answers.extend(q['plausible_answers'])

        # Improve the context for better tokenization
        raw_context = para_js['context']
        # context = augment_long_text(para_js['context'], all_para_answers)
        context = raw_context

        context_toks = tokenize_long_text(context)
        context_toks = [t.strip(' ') for t in context_toks]
        anchors = align(raw_context, context_toks)
        questions = []
        for q in para_js['qas']:
            question_text = q['question']
            q_toks = tokenize(question_text)
            ques_id = q['id']
            answers = Answer.parse_json(q['answers'], raw_context,
                                        context_toks,
                                        anchors) if 'answers' in q else []
            plausible_answers = Answer.parse_json(
                q['plausible_answers'], raw_context, context_toks,
                anchors) if 'plausible_answers' in q else []
            questions.append(
                Question(question_text, ques_id, q_toks, answers,
                         plausible_answers))

        para = Paragraph(raw_context, context, context_toks, questions,
                         para_idx, anchors)
        for ques in questions:
            ques.set_paragraph(para)
        return para
Example #14
def transform(record, uri):
    '''
    Extract the relevant data and return a Solr document dict.
    '''
    document = {}

    # The main DBpedia URI as document id
    document['id'] = uri

    # Other language, Wikidata uris
    if uri.startswith('http://nl.'):
        document['uri_nl'] = uri
    else:
        document['uri_en'] = uri

    if PROP_SAME_AS in record:
        for u in record[PROP_SAME_AS]:
            if uri.startswith('http://nl.'):
                if u.startswith('http://dbpedia.org/resource/'):
                    document['uri_en'] = u
            if u.startswith('http://www.wikidata.org/entity/'):
                document['uri_wd'] = u

    # The first (i.e. Dutch if available) label
    document['label'] = record[PROP_LABEL][0]

    # Normalized pref label, based on the label without specification
    # between brackets
    pref_label = utilities.normalize(remove_spec(document['label']))
    document['pref_label'] = pref_label
    document['pref_label_str'] = pref_label

    # The first (i.e. Dutch if available) abstract
    try:
        document['abstract'] = record[PROP_ABSTRACT][0]
    except Exception as e:
        document['abstract'] = '.'

    bow = utilities.tokenize(document['abstract'], max_sent=5)
    document['abstract_norm'] = ' '.join(bow)
    document['abstract_token'] = list(set([t for t in bow if len(t) > 5]))[:15]

    # Language of the (primary) resource description
    document['lang'] = 'nl' if uri.startswith('http://nl.') else 'en'

    # Number of links and inlinks (max of Dutch and English counts)
    if PROP_LINK in record:
        document['outlinks'] = len(record[PROP_LINK])

    document['inlinks'] = max(record['inlinks'])

    # Number of times label appears in newspaper index
    document['inlinks_newspapers'] = ddd_jsru(pref_label)

    # Set ambiguity flag if specification between brackets present in URI and
    # save the specification
    if '_(' in uri and uri.endswith(')'):
        document['ambig'] = 1
        document['spec'] = utilities.normalize(uri_to_string(uri, True))
    else:
        document['ambig'] = 0

    # Normalized alt labels extracted form various name fields as well as
    # redirects
    cand = record[PROP_LABEL][1:]
    cand += record[PROP_NAME]

    if PROP_REDIRECT in record:
        # Exclude English redirects if there are too many
        if len([
                u for u in record[PROP_REDIRECT]
                if u.startswith('http://dbpedia.org/resource/')
        ]) > 100:
            cand += [
                uri_to_string(u) for u in record[PROP_REDIRECT]
                if u.startswith('http://nl.dbpedia.org/resource/')
            ]
        else:
            cand += [uri_to_string(u) for u in record[PROP_REDIRECT]]

    # Include disambiguations for acronyms
    if PROP_DISAMBIGUATES in record:
        for u in record[PROP_DISAMBIGUATES]:
            s = uri_to_string(u)
            if len(s) >= 2 and len(s) <= 5 and s.isupper():
                cand.append(s)

    # Include Wikidata aliases
    if document.get('uri_wd'):
        wd_cand = get_wd_aliases(document.get('uri_wd'))
        cand += wd_cand

        wd_alt_label = clean_labels(wd_cand, pref_label)
        document['wd_alt_label'] = wd_alt_label
        document['wd_alt_label_str'] = wd_alt_label

    alt_label = clean_labels(cand, pref_label)
    document['alt_label'] = alt_label
    document['alt_label_str'] = alt_label

    # Keywords extracted from Dutch DBpedia category links, e.g.
    # http://nl.dbpedia.org/resource/Categorie:Amerikaans_hoogleraar
    # should return ['amerikaans', 'hoogleraar']
    if PROP_LINK in record:
        keywords = []
        for link in record[PROP_LINK]:
            if link.startswith('http://nl.dbpedia.org/resource/Categorie:'):
                s = uri_to_string(link).split('Categorie:')[1]
                # Crude stop word filtering. Use list instead?
                keywords += [
                    k for k in utilities.normalize(s).split() if len(k) >= 5
                ]
        keywords = list(set(keywords))
        for k in pref_label.split():
            if k in keywords:
                keywords.remove(k)
        document['keyword'] = keywords

    # DBpedia ontology and schema.org types
    if PROP_TYPE in record:
        document['dbo_type'] = list(
            set([
                t.split('/')[-1] for t in record[PROP_TYPE]
                if t.startswith('http://dbpedia.org/ontology/')
                and t.find('Wikidata:') < 0 and t.find('%') < 0
            ]))
        document['schema_type'] = list(
            set([
                t.split('/')[-1] for t in record[PROP_TYPE]
                if t.startswith('http://schema.org/')
            ]))

    # Predicted topics and types
    resp = requests.get(TOPICS_URL, params={'url': uri}, timeout=300)
    if resp.status_code != 200:
        raise Exception('Error retrieving topics')

    resp = resp.json()

    for t in resp['topics']:
        document['topic_{}'.format(t)] = resp['topics'][t]

    for t in resp['types']:
        document['dbo_type_{}'.format(t)] = resp['types'][t]

    # Probable last name, for persons only
    if (('dbo_type' in document and 'Person' in document['dbo_type']) or
        ('dbo_type' not in document and document['dbo_type_person'] >= 0.75)):
        last_part = utilities.get_last_part(pref_label,
                                            exclude_first_part=True)
        if last_part:
            document['last_part'] = last_part
            document['last_part_str'] = last_part

    # Birth and death dates, taking the minimum of multiple birth date options
    # and the maximum of multiple death dates
    # E.g. -013-10-07+01:00
    if PROP_BIRTH_DATE in record:
        cand = []
        for date in record[PROP_BIRTH_DATE]:
            try:
                cand.append(int(date[:4]))
            except Exception as e:
                continue
        if cand:
            document['birth_year'] = min(cand)

    if PROP_DEATH_DATE in record:
        cand = []
        for date in record[PROP_DEATH_DATE]:
            try:
                cand.append(int(date[:4]))
            except Exception as e:
                continue
        if cand:
            document['death_year'] = max(cand)

    # Birth and death places, giving preference to Dutch options
    nl_resource = 'http://nl.dbpedia.org/resource/'
    en_resource = 'http://dbpedia.org/resource/'

    if PROP_BIRTH_PLACE in record:
        places = [
            utilities.normalize(uri_to_string(p))
            for p in record[PROP_BIRTH_PLACE] if p.startswith(nl_resource)
        ]
        if not places:
            places = [
                utilities.normalize(uri_to_string(p))
                for p in record[PROP_BIRTH_PLACE] if p.startswith(en_resource)
            ]
        document['birth_place'] = list(set(places))

    if PROP_DEATH_PLACE in record:
        places = [
            utilities.normalize(uri_to_string(p))
            for p in record[PROP_DEATH_PLACE] if p.startswith(nl_resource)
        ]
        if not places:
            places = [
                utilities.normalize(uri_to_string(p))
                for p in record[PROP_DEATH_PLACE] if p.startswith(en_resource)
            ]
        document['death_place'] = list(set(places))

    # OCR tolerant labels
    if 'pref_label' in document:
        pref_label_ocr = utilities.normalize_ocr(document['pref_label'])
        document['pref_label_ocr'] = pref_label_ocr
        document['pref_label_str_ocr'] = pref_label_ocr

    if 'alt_label' in document:
        alt_label_ocr = [
            utilities.normalize_ocr(label) for label in document['alt_label']
        ]
        document['alt_label_ocr'] = alt_label_ocr
        document['alt_label_str_ocr'] = alt_label_ocr

    if 'last_part' in document:
        last_part_ocr = utilities.normalize_ocr(document['last_part'])
        document['last_part_ocr'] = last_part_ocr
        document['last_part_str_ocr'] = last_part_ocr

    # Vectors
    # Wikidata
    if 'uri_wd' in document:
        payload = {'source': document['uri_wd'].split('/')[-1]}
        response = requests.get(W2V_URL, params=payload, timeout=300)
        data = response.json()
        if data['vectors']:
            data = [float('{0:.3f}'.format(f)) for f in data['vectors'][0]]
            document['vector'] = json.dumps(data)

    # Abstract and keyword tokens
    tokens = []
    if 'abstract_token' in document:
        tokens.extend(document['abstract_token'])
    if 'keyword' in document:
        tokens.extend(document['keyword'])

    if tokens:
        if 'pref_label' in document:
            tokens = [
                t for t in tokens if t not in document['pref_label'].split()
            ]
        tokens = [
            t for t in tokens if t not in dictionary.unwanted and len(t) >= 5
        ]

        payload = {'source': ' '.join(list(set(tokens)))}
        response = requests.get(W2V_URL, params=payload, timeout=300)
        data = response.json()['vectors']
        if data:
            document['abstract_vector'] = [
                json.dumps([float('{0:.3f}'.format(f)) for f in v])
                for v in data
            ]

    return document
Example #15
def merge_otherft(sentences_set):
	for sent in sentences_set:
		sent.tokens = tokenize(sent.content)  # tokenize all sentences
		if sent.labeled_aspects == 'other features':
			sent.labeled_aspects = 'no feature'  # map 'other features' labels to 'no feature'
Example #16
#! /usr/bin/env python3

import os, sys, time, re
from utilities import tokenize, runProgram, exitShell
from utilities import changeDirectory
""" The following is based on:
	-) https://github.com/robustUTEP/os-demos/blob/master/ch5-api/p3-exec.py
	-) https://brennan.io/2015/01/16/write-a-shell-in-c/
"""

# initialize the status to an impossibly large sentinel value
status = sys.maxsize
args = ''

while True:
    args = tokenize(' ', input("$ "))

    if args[0] == 'exit':
        exitShell(args)

    elif args[0] == 'pwd':
        # print the working directory
        os.write(1, (os.getcwd() + '\n').encode())
        continue

    elif args[0] == 'cd':
        try:
            os.chdir(args[1])
        except (IndexError, OSError):
            os.write(2, ("path not valid: %s\n" % ' '.join(args[1:])).encode())
        continue
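
Commands other than the built-ins fall through to `runProgram`, which is imported above but not shown; a minimal sketch following the fork/exec/wait pattern from the referenced tutorials (the real helper may add PATH handling or redirection):

def runProgram(args):
    pid = os.fork()
    if pid == 0:
        # Child: replace this process image with the requested command.
        try:
            os.execvp(args[0], args)
        except FileNotFoundError:
            os.write(2, ("command not found: %s\n" % args[0]).encode())
            os._exit(1)
    else:
        # Parent: wait for the child and return its raw exit status.
        _, wait_status = os.waitpid(pid, 0)
        return wait_status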
Example #17
 def prepare_word_list(self):
     """Create words lists from documents"""
     print "pre-processing text..."
     self.texts = [[word.lower() for word in utils.tokenize(document) if word not in utils.SW] for document in self.documents]
     self.word_counts = [float(len(text)) for text in self.texts]