コード例 #1
0
        def render(item, page_number):
            """Recursively walk a pdfminer layout item collecting text rows.

            Appends a (page_number, x1, y1, x2, y2, text) tuple to self.rows
            for every non-empty text line. Once any line containing
            'contents' marks the document as an outline (self.outline),
            keyword-matching lines are filed into self.interesting_text
            (detected as English) or self.aux_text (partly English).
            """
            if isinstance(item, (LTPage, LTTextBox)):
                for child in item:
                    render(child, page_number)
            elif isinstance(item, LTTextLine):
                # Concatenate character-level text, then collapse runs of
                # whitespace into single spaces.
                child_str = ''.join(
                    child.get_text() for child in item
                    if isinstance(child, (LTChar, LTAnno)))
                child_str = ' '.join(child_str.split()).strip()
                if child_str:
                    row = (page_number, item.bbox[0], item.bbox[1],
                           item.bbox[2], item.bbox[3], child_str
                           )  # bbox == (x1, y1, x2, y2)
                    #  HACK
                    # Lowercase once instead of per keyword test.
                    text = child_str.lower()
                    # check if it is outline page
                    if 'contents' in text:
                        # BUG FIX: original assigned True and then
                        # immediately overwrote it with 1; one truthy
                        # assignment suffices.
                        self.outline = True

                    if self.outline:
                        keywords = (
                            'agricultur', 'health', 'social', 'schedule',
                            'labour', 'revenue', 'amendment', 'cancellation',
                            'extension', 'correction', 'trade', 'industry',
                            'specification', 'customs', 'renewal',
                            'agreement', 'education', 'regulation',
                            'registration', 'nurse', 'auxiliary', 'student',
                            'benefit', 'act')
                        if any(kw in text for kw in keywords):
                            # strip '...' leaders before language detection
                            entry = text.replace(".", "").strip()
                            language, ratio = detect_language(entry)
                            if language == 'english':
                                self.interesting_text.append(entry)
                            elif ratio['english'] > 0:  # some Eng words
                                self.aux_text.append(entry)
                    # end HACK

                    self.rows.append(row)

                for child in item:
                    render(child, page_number)

            return
コード例 #2
0
ファイル: classes.py プロジェクト: AnnaMag/pdf-hacks
        def render(item, page_number):
            """Recursively walk a pdfminer layout item collecting text rows.

            Appends a (page_number, x1, y1, x2, y2, text) tuple to self.rows
            for every non-empty text line. Once any line containing
            'contents' marks the document as an outline (self.outline),
            keyword-matching lines are filed into self.interesting_text
            (detected as English) or self.aux_text (partly English).
            """
            if isinstance(item, (LTPage, LTTextBox)):
                for child in item:
                    render(child, page_number)
            elif isinstance(item, LTTextLine):
                # Concatenate character-level text, then collapse runs of
                # whitespace into single spaces.
                child_str = ''.join(
                    child.get_text() for child in item
                    if isinstance(child, (LTChar, LTAnno)))
                child_str = ' '.join(child_str.split()).strip()
                if child_str:
                    row = (page_number, item.bbox[0], item.bbox[1],
                           item.bbox[2], item.bbox[3],
                           child_str)  # bbox == (x1, y1, x2, y2)
                    #  HACK
                    # Lowercase once instead of per keyword test.
                    text = child_str.lower()
                    # check if it is outline page
                    if 'contents' in text:
                        # BUG FIX: original assigned True and then
                        # immediately overwrote it with 1; one truthy
                        # assignment suffices.
                        self.outline = True

                    if self.outline:
                        keywords = (
                            'agricultur', 'health', 'social', 'schedule',
                            'labour', 'revenue', 'amendment', 'cancellation',
                            'extension', 'correction', 'trade', 'industry',
                            'specification', 'customs', 'renewal',
                            'agreement', 'education', 'regulation',
                            'registration', 'nurse', 'auxiliary', 'student',
                            'benefit', 'act')
                        if any(kw in text for kw in keywords):
                            # strip '...' leaders before language detection
                            entry = text.replace(".", "").strip()
                            language, ratio = detect_language(entry)
                            if language == 'english':
                                self.interesting_text.append(entry)
                            elif ratio['english'] > 0:  # some Eng words
                                self.aux_text.append(entry)
                    # end HACK

                    self.rows.append(row)

                for child in item:
                    render(child, page_number)

            return
コード例 #3
0
ファイル: word2vec.py プロジェクト: alfredolainez/deep-nlp
def tokenize_document(docpair, use_nltk=True):
    """Tokenize a (doc_id, text) pair into lowercase ASCII-encoded tokens.

    Uses nltk's word_tokenize when use_nltk is true, otherwise the spaCy
    pipeline `nlp`. When FILTER_ENGLISH is set, only tokens whose detected
    language is 'english' are kept.
    """
    print('working on doc {}'.format(docpair[0]))
    if use_nltk:
        tokens = word_tokenize(docpair[1])
        if FILTER_ENGLISH:
            return [t.encode('ascii', errors='ignore').lower()
                    for t in tokens if detect_language(t) == 'english']
        return [t.encode('ascii', errors='ignore').lower() for t in tokens]
    # spaCy branch: tokens carry a precomputed lowercase form (.lower_).
    if FILTER_ENGLISH:
        return [t.lower_.encode('ascii', errors='ignore')
                for t in nlp(docpair[1]) if detect_language(t) == 'english']
    return [t.lower_.encode('ascii', errors='ignore') for t in nlp(docpair[1])]
コード例 #4
0
ファイル: __init__.py プロジェクト: jigarpshah/maze
def track_generator(languages, emotion, seeds, year_range, recreate=False):
    """Yield Rdio tracks from a Pandora station filtered by language and year.

    Builds (or fetches) a Pandora station from the seeds, then for each
    playlist track keeps only songs whose detected language is in
    `languages` and whose Rdio release year falls in `year_range`.
    If the first pass ran against an existing station (recreate=False),
    a second pass recreates the station and yields from it as well.
    """
    station = Pandora_Station.get_or_create(languages, emotion, seeds, recreate)
    for track in station.get_playlist():
        try:
            artist = track['artistName']
            song_name = track['songName']
        except KeyError:
            continue  # malformed playlist entry; skip it
        detected_language = detect_language(song_name)
        if detected_language not in languages:
            print(u"Wrong language {} for {}".format(detected_language, song_name))
            continue
        rdio_track = Rdio_Track.search(artist, song_name)
        if rdio_track is None:
            print(u"No rdio version of {}".format(song_name))
            continue
        release_year = rdio_track.get_release_date().year
        if release_year not in year_range:
            print(u"Release year {} outside range for {}".format(release_year, song_name))
            continue
        print(u"Found {}".format(rdio_track))
        yield rdio_track
    if not recreate:
        for rdio_track in track_generator(languages, emotion, seeds,
                                          year_range, recreate=True):
            yield rdio_track
コード例 #5
0
ファイル: word2vec.py プロジェクト: SemanticPrincess/deep-nlp
def tokenize_document(docpair, use_nltk=True):
    """Tokenize a (doc_id, text) pair into lowercase ASCII-encoded tokens.

    Tokenizes with nltk's word_tokenize or the spaCy pipeline `nlp`,
    optionally dropping tokens not detected as English (FILTER_ENGLISH).
    """
    print('working on doc {}'.format(docpair[0]))

    def keep(token):
        # With filtering off, every token passes.
        return (not FILTER_ENGLISH) or detect_language(token) == 'english'

    if use_nltk:
        return [t.encode('ascii', errors='ignore').lower()
                for t in word_tokenize(docpair[1]) if keep(t)]
    return [t.lower_.encode('ascii', errors='ignore')
            for t in nlp(docpair[1]) if keep(t)]
コード例 #6
0
def get_reviews_data(partitions_to_use, pickle_base_name):
    """
    Gets loaded json data in pickles and returns fields of interest.

    Returns (review_texts, useful_votes, funny_votes, cool_votes,
    review_stars): parallel lists with one entry per review.
    """

    data = load_partitions(partitions_to_use, pickle_base_name)
    review_texts = []
    useful_votes = []
    funny_votes = []
    cool_votes = []
    review_stars = []

    for review in data:
        review_texts.append(review['text'])
        # BUG FIX: this line was truncated in the original
        # ("review['votes']['useful'"), a syntax error.
        useful_votes.append(review['votes']['useful'])
        cool_votes.append(review['votes']['cool'])
        funny_votes.append(review['votes']['funny'])
        review_stars.append(review['stars'])

    return review_texts, useful_votes, funny_votes, cool_votes, review_stars

def give_balanced_classes(reviews, funny_votes):
    """
    From all the reviews and votes given, partitions the data into two
    classes: funny reviews and not funny reviews.
    All the funny reviews found are returned. The method assumes a majority
    of not-funny votes. The same number of not-funny reviews are returned,
    randomly selected.
    Returned data is a shuffled balanced set of funny and not funny reviews,
    as a (reviews, labels) pair with label 1 == funny, 0 == not funny.
    """

    # We will consider a review to be funny if it has 3 or more funny votes.
    # Not funny reviews have 0 votes.
    VOTES_THRESHOLD = 3
    not_funny_reviews_indices = []

    # Find all the funny reviews we can.
    final_reviews = []
    final_labels = []
    for i, review in enumerate(reviews):
        if funny_votes[i] >= VOTES_THRESHOLD:
            final_reviews.append(review)
            final_labels.append(1)
        elif funny_votes[i] == 0:
            not_funny_reviews_indices.append(i)

    # We want balanced classes, so sample the same number of not-funny
    # reviews at random.
    np.random.shuffle(not_funny_reviews_indices)
    num_funny_reviews = len(final_reviews)
    for i in range(num_funny_reviews):
        final_reviews.append(reviews[not_funny_reviews_indices[i]])
        final_labels.append(0)

    # Shuffle final reviews and labels together.
    # BUG FIX: materialize with list() -- on Python 3 zip() is lazy and
    # np.random.shuffle cannot shuffle an iterator (identical on Python 2).
    combined_lists = list(zip(final_reviews, final_labels))
    # BUG FIX: guard the empty case; zip(*[]) would raise when no funny
    # reviews were found.
    if combined_lists:
        np.random.shuffle(combined_lists)
        final_reviews[:], final_labels[:] = zip(*combined_lists)

    print("Returning %d funny reviews and a total of %d reviews"
          % (num_funny_reviews, len(final_reviews)))

    return (final_reviews, final_labels)

def create_data_sets(partition_list=range(1, 100), pickle_base_name=DEFAULT_REVIEWS_PICKLE + '.'):
    """
    Creates a 50% - 25% - 25% train/validation/test partition of the
    classification problem. Classes are balanced.
    It reads the list of partitions saved in pickles.
    Resulting data sets are saved as python pickles named
    "TrainSet_<N>", "DevSet_<N>" and "TestSet_<N>".
    """

    # NOTE(review): get_reviews_data() also calls load_partitions(); this
    # extra call looks redundant unless it primes a cache -- confirm.
    load_partitions(partition_list, pickle_base_name)
    reviews, _, funny_votes, _, _ = get_reviews_data(partition_list, pickle_base_name)
    reviews, labels = give_balanced_classes(reviews, funny_votes)
    N = len(reviews)

    # BUG FIX: use floor division so the slice bounds stay integers on
    # Python 3 (identical result on Python 2 for int N).
    half = N // 2
    three_quarters = 3 * N // 4

    train_reviews = reviews[:half]
    train_labels = labels[:half]

    dev_reviews = reviews[half:three_quarters]
    dev_labels = labels[half:three_quarters]

    test_reviews = reviews[three_quarters:]
    test_labels = labels[three_quarters:]

    # BUG FIX: the original never closed the files it opened for
    # pickle.dump; use context managers so handles are released.
    with open("TrainSet_" + str(N), "wb") as f:
        pickle.dump([train_reviews, train_labels], f, pickle.HIGHEST_PROTOCOL)

    with open("DevSet_" + str(N), "wb") as f:
        pickle.dump([dev_reviews, dev_labels], f, pickle.HIGHEST_PROTOCOL)

    with open("TestSet_" + str(N), "wb") as f:
        pickle.dump([test_reviews, test_labels], f, pickle.HIGHEST_PROTOCOL)


def accept_only_english(json_review):
    """Return True if the review text is English or too short to classify."""
    text = json_review['text']
    # Short texts are hard to classify in any language, so accept them.
    if len(text) > 150:
        return language.detect_language(text) == 'english'
    return True