def test_isint_returns_False_if_given_float_string_padded_or_not(x, y, z):
    """``isint`` must reject the repr of a non-NaN, non-infinite ``x``.

    ``y`` and ``z`` are the numbers of spaces placed before and after the
    repr of ``x``.  The bare repr and the padded string are both checked,
    and the padded string is additionally checked with every base from
    2 through 36.
    """
    # NaN and infinities are excluded from this property.
    assume(not math.isnan(x))
    assume(not math.isinf(x))
    number_text = repr(x)
    padded = ' ' * y + number_text + ' ' * z
    assert not fastnumbers.isint(number_text)
    assert not fastnumbers.isint(padded)
    # Only short strings are tried with an explicit base (original note:
    # avoids a recursion error because of an overly simple baseN function).
    if len(padded) < 30:
        for base in range(2, 36 + 1):
            assert not fastnumbers.isint(padded, base=base)
Ejemplo n.º 2
0
def load_data_for_nn():
    """Load channel-classification data as padded sequences and one-hot labels.

    Reads ``train_set.csv`` from ``DIR_TRAIN``, keeps the rows flagged by
    ``main_msg`` that belong to the channels listed below, and splits them by
    ``timestamp``: rows at or before 2017-04-01 00:00 form the training set,
    later rows the validation set.  Each split has its channel mapped through
    ``MAPPINGS``, is sorted by channel, has its text cleaned, and loses every
    message that is numeric or shorter than 20 characters after cleaning.

    :return: ``(X_train, train_labels, X_val, val_labels)``; the ``X_*``
        values are produced by ``pad_sequences`` and the labels by
        ``to_categorical``.
    """
    channels = [
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]
    # Raw strings keep regex escapes such as \S and \w out of Python's own
    # string-escape processing (the non-raw forms are invalid escapes).
    markup_re = re.compile(r'(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)')
    spaces_re = re.compile(r'\s+')

    def _prepare(split):
        """Return (lower-cased text Series, int8 label array) for one split."""
        frame = split[['channel', 'text']].reset_index(drop=True)
        frame['channel'] = frame.channel.map(MAPPINGS)
        frame = frame.sort_values('channel').reset_index(drop=True)

        # Replace matched markup with a space, then collapse whitespace runs.
        frame['text'] = frame.text.astype(str) \
            .apply(lambda x: markup_re.sub(' ', x)) \
            .apply(lambda x: spaces_re.sub(' ', x))

        # Drop messages that are just a number or too short to be useful.
        rejected = frame.text.apply(
            lambda x: isfloat(x) or isint(x) or len(x) < 20)
        frame = frame[~rejected]

        text = frame['text'].astype(str).apply(lambda x: x.lower())
        labels = np.asarray(frame['channel'], dtype='int8')
        return text, labels

    data = pd.read_csv(os.path.join(DIR_TRAIN, 'train_set.csv'),
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    data = data[data.channel.isin(channels) & data.main_msg]

    # Use a Timestamp for the cut-off: newer pandas releases no longer accept
    # ordering comparisons between datetime64 data and datetime.date objects.
    cutoff = pd.Timestamp(2017, 4, 1)
    train_text, train_labels = _prepare(data[data['timestamp'] <= cutoff])
    val_text, val_labels = _prepare(data[data['timestamp'] > cutoff])

    vocab, _ = create_vocab_set()

    X_train = text2sequence(train_text, vocab)
    X_val = text2sequence(val_text, vocab)

    X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, value=0)
    X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH, value=0)

    # One class per retained channel.
    train_labels = to_categorical(train_labels, num_classes=len(channels))
    val_labels = to_categorical(val_labels, num_classes=len(channels))

    return X_train, train_labels, X_val, val_labels
Ejemplo n.º 3
0
def load_data():
    """Load message texts labelled by author for the 100 most active users.

    Reads ``../data/train_set.csv``, keeps the rows flagged by ``main_msg``
    that belong to the channels listed below, and then keeps only the 100
    users with the most remaining messages.  Each of those users is given an
    integer label (0 for the most frequent).  Rows at or before
    2017-04-01 00:00 form the training set, later rows the validation set.
    Each split is sorted by label, has its text cleaned, and loses every
    message that is numeric or shorter than 20 characters after cleaning.

    :return: ``(train_text, train_labels, val_text, val_labels)``; texts are
        lower-cased pandas Series and labels are int8 NumPy arrays.
    """
    channels = [
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]
    # Raw strings keep regex escapes such as \S and \w out of Python's own
    # string-escape processing (the non-raw forms are invalid escapes).
    markup_re = re.compile(r'(<\S+>:?)|(&gt;)|([\w\.]*@[\w\.]*)')
    spaces_re = re.compile(r'\s+')

    data = pd.read_csv('../data/train_set.csv',
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    data = data[data.channel.isin(channels) & data.main_msg]

    # value_counts() sorts by frequency, so the first 100 index entries are
    # the most active users.
    users_100 = list(data.user_id.value_counts().index[:100])
    data = data[data["user_id"].isin(users_100)]
    mappings = {user: label for label, user in enumerate(users_100)}

    def _prepare(split):
        """Return (lower-cased text Series, int8 label array) for one split."""
        frame = split[['user_id', 'text']].reset_index(drop=True)
        frame['user_id'] = frame.user_id.map(mappings)
        frame = frame.sort_values('user_id').reset_index(drop=True)

        # Replace matched markup with a space, then collapse whitespace runs.
        frame['text'] = frame.text.astype(str) \
            .apply(lambda x: markup_re.sub(' ', x)) \
            .apply(lambda x: spaces_re.sub(' ', x))

        # Drop messages that are just a number or too short to be useful.
        rejected = frame.text.apply(
            lambda x: isfloat(x) or isint(x) or len(x) < 20)
        frame = frame[~rejected]

        text = frame['text'].astype(str).apply(lambda x: x.lower())
        labels = np.asarray(frame['user_id'], dtype='int8')
        return text, labels

    # Split into training and validation data.  Use a Timestamp for the
    # cut-off: newer pandas releases no longer accept ordering comparisons
    # between datetime64 data and datetime.date objects.
    cutoff = pd.Timestamp(2017, 4, 1)
    train_text, train_labels = _prepare(data[data['timestamp'] <= cutoff])
    val_text, val_labels = _prepare(data[data['timestamp'] > cutoff])
    return train_text, train_labels, val_text, val_labels
Ejemplo n.º 4
0
        def func(value):
            """
            Detect the data type of a value, or test it against ``data_type``.

            :param value: value to be checked
            :return: when ``get_type`` is False, True/False depending on
                whether the detected type equals ``data_type``; otherwise the
                detected type name ("bool", "int", "float", "date", "array",
                "string" or "null").
            """
            # Order matters: bool is tested before the numeric checks.
            if isinstance(value, bool):
                detected = "bool"
            elif fastnumbers.isint(value):
                detected = "int"
            elif fastnumbers.isfloat(value):
                detected = "float"
            elif not isinstance(value, str):
                # Neither a bool, a number nor a string.
                detected = "null"
            # From here on the value is a string that was not numeric; try
            # the remaining string parsers in turn.
            elif str_to_boolean(value):
                detected = "bool"
            elif str_to_date(value):
                detected = "date"
            elif str_to_array(value):
                detected = "array"
            else:
                detected = "string"

            if get_type is not False:
                return detected
            return True if detected == data_type else False
Ejemplo n.º 5
0
def _infer_type(value):
    """Return a one-byte type code for ``value``.

    ``b"i"`` when ``fastnumbers.isint`` accepts the value, ``b"f"`` when
    ``fastnumbers.isfloat`` does, and ``b"s"`` otherwise.
    """
    # NOTE: an earlier short-circuit returning None for empty/missing values
    # (via f4py.is_missing_value) is disabled in this version.
    if fastnumbers.isint(value):
        return b"i"
    return b"f" if fastnumbers.isfloat(value) else b"s"
Ejemplo n.º 6
0
def load_data_gbm():
    """Load channel-labelled message texts for the gradient-boosting model.

    Reads ``train_set.csv`` from ``dir_train``, keeps the rows flagged by
    ``main_msg`` that belong to the channels listed below, and splits them by
    ``timestamp``: rows at or before 2017-04-01 00:00 form the training set,
    later rows the validation set.  Each split has its channel mapped through
    ``mappings``, is sorted by channel, has its text cleaned, and loses every
    message that is numeric or shorter than 20 characters after cleaning.

    :return: ``(train_text, val_text, train_labels, val_labels)`` -- note the
        order: both texts first, then both label arrays.  Texts are
        lower-cased pandas Series and labels are int8 NumPy arrays.
    """
    channels = [
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]
    # Raw strings keep regex escapes such as \S and \w out of Python's own
    # string-escape processing (the non-raw forms are invalid escapes).
    markup_re = re.compile(r'(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)')
    spaces_re = re.compile(r'\s+')

    def _prepare(split):
        """Return (lower-cased text Series, int8 label array) for one split."""
        frame = split[['channel', 'text']].reset_index(drop=True)
        frame['channel'] = frame.channel.map(mappings)
        frame = frame.sort_values('channel').reset_index(drop=True)

        # Replace matched markup with a space, then collapse whitespace runs.
        frame['text'] = frame.text.astype(str) \
            .apply(lambda x: markup_re.sub(' ', x)) \
            .apply(lambda x: spaces_re.sub(' ', x))

        # Drop messages that are just a number or too short to be useful.
        rejected = frame.text.apply(
            lambda x: isfloat(x) or isint(x) or len(x) < 20)
        frame = frame[~rejected]

        text = frame['text'].astype(str).apply(lambda x: x.lower())
        labels = np.asarray(frame['channel'], dtype='int8')
        return text, labels

    data = pd.read_csv(os.path.join(dir_train, 'train_set.csv'),
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    data = data[data.channel.isin(channels) & data.main_msg]

    # Use a Timestamp for the cut-off: newer pandas releases no longer accept
    # ordering comparisons between datetime64 data and datetime.date objects.
    cutoff = pd.Timestamp(2017, 4, 1)
    train_text, train_labels = _prepare(data[data['timestamp'] <= cutoff])
    val_text, val_labels = _prepare(data[data['timestamp'] > cutoff])

    return train_text, val_text, train_labels, val_labels
Ejemplo n.º 7
0
def norm_val(val, empty_as_null: bool) -> Union[bytes, int, float, None]:
    """Normalize a value.

    ``None`` is passed through unchanged.  A value accepted by
    ``fastnumbers.isfloat`` or ``fastnumbers.isint`` is converted with
    ``fastnumbers.float``.  Anything else is stripped and encoded as UTF-8
    (ignoring encoding errors); if the stripped value is empty and
    ``empty_as_null`` is true, ``None`` is returned instead.
    """
    if val is None:
        return None

    looks_numeric = fastnumbers.isfloat(val) or fastnumbers.isint(val)
    if looks_numeric:
        return fastnumbers.float(val)

    stripped = val.strip()
    if len(stripped) == 0 and empty_as_null:
        return None
    return stripped.encode("utf-8", "ignore")
Ejemplo n.º 8
0
def infer(value):
    """
    Infer a Spark data type from a value
    :param value: value to be inferred
    :return: whatever ``get_spark_dtypes_object`` returns for the detected
        type name (or for ``None`` when no check matches)
    """
    kind = None
    # The checks run in a fixed order; the first one that matches wins.
    if value is None:
        kind = "null"
    elif is_bool(value):
        kind = "bool"
    elif isint(value):
        kind = "int"
    elif isfloat(value):
        kind = "float"
    elif is_list(value):
        # The element type is inferred from the first item only.
        kind = ArrayType(infer(value[0]))
    elif is_datetime(value):
        kind = "datetime"
    elif is_date(value):
        kind = "date"
    elif is_binary(value):
        kind = "binary"
    elif is_str(value):
        kind = "string"
        if str_to_boolean(value):
            kind = "bool"
        elif str_to_date(value):
            pass  # date-like text is still reported as a string
        elif str_to_array(value):
            pass  # array-like text is still reported as a string

    return get_spark_dtypes_object(kind)
Ejemplo n.º 9
0
def to_spark(value):
    """
    Infer a Spark data type from a value
    :param value: value to be inferred
    :return: whatever ``parse_spark_class_dtypes`` returns for the detected
        type name (or for ``None`` when no check matches)
    """
    kind = None
    # The checks run in a fixed order; the first one that matches wins.
    if value is None:
        kind = "null"
    elif is_bool_value(value):
        kind = "bool"
    elif fastnumbers.isint(value):
        kind = "int"
    elif fastnumbers.isfloat(value):
        kind = "float"
    elif is_list_value(value):
        # The element type is inferred from the first item only.
        kind = ArrayType(to_spark(value[0]))
    elif is_datetime(value):
        kind = "datetime"
    elif is_date(value):
        kind = "date"
    elif is_binary(value):
        kind = "binary"
    elif is_str(value):
        kind = "string"
        if is_bool_str(value):
            kind = "bool"
        # NOTE(review): is_datetime was already tested earlier in this chain;
        # a string-specific date check may have been intended -- confirm.
        elif is_datetime(value):
            pass  # date-like text is still reported as a string
        elif is_list_str(value):
            pass  # array-like text is still reported as a string

    return parse_spark_class_dtypes(kind)
def test_isint_returns_False_if_given_non_number_string(x):
    """``isint`` must reject any input that ``a_number`` does not accept."""
    assume(not a_number(x))
    outcome = fastnumbers.isint(x)
    assert not outcome
def test_isint_given_unicode_of_more_than_one_char_returns_False(x):
    """``isint`` must reject input that ``a_number`` does not accept."""
    assume(not a_number(x))
    outcome = fastnumbers.isint(x)
    assert not outcome
def test_isint_given_unicode_non_numeral_returns_False(x):
    """``isint`` must return a falsy result for the supplied ``x``."""
    outcome = fastnumbers.isint(x)
    assert not outcome
def test_isint_given_unicode_digit_returns_True(x):
    """``isint`` must accept ``x`` both bare and padded with spaces."""
    assert fastnumbers.isint(x)
    # Try padded as well: three spaces on each side.
    padding = u'   '
    padded = padding + x + padding
    assert fastnumbers.isint(padded)
def test_isint_returns_False_if_given_string_and_num_only_is_True(x):
    """With ``num_only=True``, ``isint`` must reject the repr string of ``x``."""
    as_text = repr(x)
    outcome = fastnumbers.isint(as_text, num_only=True)
    assert not outcome
Ejemplo n.º 15
0
def main(add_creds='A',
         base_name='B',
         no_color=False,
         compare=False,
         display='D',
         extended=False,
         from_file='F',
         no_grid=False,
         list=False,
         reuse_json=False,
         text_move='M',
         confidence='N',
         output_dir='O',
         quiet=False,
         relaxed=False,
         services='S',
         threads='T',
         version=False,
         text_color='X',
         text_size='Z',
         debug='OUT',
         *files):
    '''Handprint (a loose acronym of "HANDwritten Page RecognitIoN Test") runs
alternative text recognition services on images of handwritten document pages.

Installing credentials for cloud-based services
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the command-line flag -l (or /l on Windows), Handprint will print a
list of the known services, and exit.

Before a given service can be used, if it is cloud-based commercial OCR/HTR
service, Handprint needs to be supplied with user credentials for accessing
that service.  The credentials must be stored in a JSON file with a certain
format; see the Handprint user documentation for details about the formats
for each service.  To add a new credentials file, use the -a option (/a on
Windows) in combination with the name of a service and a single file path on
the command line.  The name supplied right after the -a option must be the
name of a recognized service (such as "google", "amazon", "microsoft"), and
the file argument must be a JSON file containing the credentials data in the
required format for that service.  Here is an example of adding credentials
for Google (assuming you created the JSON file as described in the docs):

  handprint -a google mygooglecreds.json

Run Handprint with the -a option multiple times to install credentials for
each different service.  Handprint will copy the credential files to its own
configuration directory and exit without doing anything else.  The directory
is different on different operating systems; for example, on macOS it
is ~/Library/Application Support/Handprint/.

Basic usage
~~~~~~~~~~~

After credentials are installed, running Handprint without the -a option will
invoke one or more OCR/HTR services on files, directories of files, or URLs.
Here is an example of running Handprint on a directory containing images:

  handprint tests/data/caltech-archives/glaser/

Image paths or URLs can be supplied to Handprint in any of the following ways:

 a) one or more directory paths or one or more image file paths, which will
    be interpreted as images (either individually or in directories) to be
    processed;

 b) one or more URLs, which will be interpreted as network locations of image
    files to be processed; or

 c) if given the -f option (/f on Windows), a file containing either image
    paths or image URLs.

Note that providing URLs on the command line can be problematic due to how
terminal shells interpret certain characters, and so when supplying URLs,
it's usually better to store the URLs in a file and use the -f option.
Regardless, when given URLs, Handprint will first download the images to a
local directory indicated by the option -o (/o on Windows), or the current
directory if option -o is not used.

No matter whether files or URLs, each input should be a single image of a
document page in which text should be recognized.  Handprint can accept input
images in JP2, JPEG, PDF, PNG, GIF, BMP, and TIFF formats. To make the
results from different services more easily comparable, Handprint will always
convert all input images to the same format (PNG) no matter if some services
may accept other formats; it will also downsize input images to the smallest
size accepted by any of the services invoked if an image exceeds that size.
(For example, if service A accepts files up to 10 MB in size and service B
accepts files up to 4 MB, all input images will be resized to 4 MB before
sending them to both A and B, even if A could accept a higher- resolution
image.)  Finally, if the input contains more than one page (e.g., in a PDF
file), Handprint will only use the first page and ignore the rest.

Be aware that resizing images to the lowest common size means that the text
recognition results returned by some services may be different than if the
original full-size input image had been sent to that service.  If your images
are larger (when converted to PNG) than the size threshold for some services
(which is currently 4 MB when Microsoft is one of the destinations), then you
may wish to compare the results of using multiple services at once versus
using the services one at a time.

Selecting destination services
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The default action is to run all known services.  The option -s (/s on
Windows) can be used to select only one service or a list of services
instead.  Lists of services should be separated by commas; e.g.,
"google,microsoft".  To find out which services are supported by Handprint, run
it with the command-line flag -l (or /l on Windows), which will make Handprint
print a list of the known services and exit immediately.

Visual display of recognition results
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

After gathering the results of each service for a given input, Handprint will
create a single compound image consisting of the results for each service
arranged in a grid.  This is intended to make it easier to compare the results
of multiple services against each other.  To skip the creation of the results
grid, use the -G option (/G on Windows).  The grid image will have a name with
the following pattern:

  somefile.handprint-all.png

If given the -e option (/e on Windows), Handprint will produce extended
output that includes the complete response from the service (converted to a
JSON file by Handprint) and the text extracted (stored as a .txt file).  The
output of -e will be multiple files like this:

  somefile.handprint-amazon-rekognition.json
  somefile.handprint-amazon-rekognition.png
  somefile.handprint-amazon-rekognition.txt
  somefile.handprint-amazon-textract.json
  somefile.handprint-amazon-textract.png
  somefile.handprint-amazon-textract.txt
  somefile.handprint-google.json
  somefile.handprint-google.png
  somefile.handprint-google.txt
  somefile.handprint-microsoft.json
  somefile.handprint-microsoft.png
  somefile.handprint-microsoft.txt
  ...

The files will be written to the directory indicated by -o, or (if -o is not
used) the directory where "somefile" is located.  When -o is not used and
the input images are given as URLs, then the files are written to the current
working directory instead.

When the inputs are URLs, Handprint must download a copy of the image located
at the network address (because it is not possible to write the results in
the network locations represented by the URLs.).  The images and other
results will be stored in files whose root names have the form "document-N",
where "N" is an integer.  The root name can be changed using the -b option
(/b on Windows).  The image at networked locations will be converted to
ordinary PNG format for maximum compatibility with the different OCR
services and written to "document-N.png", and the URL corresponding to each
document will be written in a file named "document-N.url" so that it is
possible to connect each "document-N.png" to the URL it came from.

Finally, note that the use of the -G option (/G on Windows) WITHOUT either
the -e or -c option is an error because it means no output would be produced.

Type of annotations
~~~~~~~~~~~~~~~~~~~

Handprint produces copies of the input images overlayed with the recognition
results received from the different services.  By default, it shows only the
recognized text.  The option -d (/d on Windows) can be used to tell Handprint
to display other results.  The recognized values are as follows:

  text    -- display the text recognized in the image (default)
  bb      -- display all bounding boxes returned by the service
  bb-word -- display only the bounding boxes for words (in red)
  bb-line -- display only the bounding boxes for lines (in blue)
  bb-para -- display only the bounding boxes for paragraphs (in green)

Separate multiple values with a comma.  The option "bb" is a shorthand for the
value "bb-word,bb-line,bb-para".  As an example, the following command will
show both the recognized text and the bounding boxes around words:

  handprint -d text,bb-word  somefile.png

Note that as of June 2021, the main services (Amazon, Google, Microsoft) do not
all provide the same bounding box information in their results.  The following
table summarizes what is available:

               Bounding boxes available
  Service      Word    Line   Paragraph
  ---------    ----    ----   ---------
  Amazon         Y       Y        -
  Google         Y       -        Y
  Microsoft      Y       Y        -

If a service does not provide a particular kind of bounding box, Handprint will
not display that kind of bounding box in the annotated output for that service.

Thresholding by confidence
~~~~~~~~~~~~~~~~~~~~~~~~~~

All of the services return confidence scores for items recognized in the input.
By default, Handprint will show all results in the annotated image, no matter
how low the score.  The option -n (/n on Windows) can be used to threshold the
results based on the confidence value for each item (text or bounding boxes).
The value provided as the argument to -n must be a floating point number
between 0 and 1.0.  For example, the following command will make Handprint only
show text that is rated with at least 99.5% confidence:

  handprint -n 0.995  somefile.png

Note that the confidence values returned by the different services are not
normalized against each other.  What one service considers to be 80% confidence
may not be what another service considers 80% confidence.  Handprint performs
the thresholding against the raw scores returned by each service individually.

Comparing results to expected output
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Handprint supports comparing the output of HTR services to expected output
(i.e., ground truth) using the option -c (or /c on Windows).  This facility
requires that the user provides text files that contain the expected text
for each input image.  The ground-truth text files must have the following
characteristics:

 a) The file containing the expected results should be named ".gt.txt", with
    a base name identical to the image file.  For example, an image file named
    "somefile.jpg" should have a corresponding text file "somefile.gt.txt".

 b) The ground-truth text file should be located in the same directory as the
    input image file.

 c) The text should be line oriented, with each line representing a line of
    text in the image.

 d) The text should be plain text only.  No Unicode or binary encodings.
    (This limitation comes from the HTR services, which -- as of this
    writing -- return results in plain text format.)

Handprint will write the comparison results to a tab-delimited file named
after the input image and service but with the extension ".tsv".  For
example, for an input image "somefile.jpg" and results received from Google,
the comparison results will be written to "somefile.handprint-google.tsv".
(The use of a tab-delimited format rather than comma-delimited format avoids
the need to quote commas and other characters in the text.)

Handprint reports, for each text line, the number of errors (the Levenshtein
edit distance) and the character error rate (CER), and at the end it also
reports a sum total of errors.  The CER is computed as the Levenshtein edit
distance of each line divided by the number of characters in the expected
line text, multiplied by 100; this approach to normalizing the CER value is
conventional but note that it can lead to values greater than 100%.

By default, comparisons are done on an exact basis; character case is not
changed, punctuation is not removed, and stop words are not removed.
However, multiple contiguous spaces are converted to one space, and leading
spaces are removed from text lines.  If given the option -r (/r on Windows),
Handprint will relax the comparison algorithm as follows:

 i) convert all text to lower case
 ii) ignore certain sentence punctuation characters, namely , . : ;

Handprint attempts to cope with possibly-missing text in the HTR results by
matching up likely corresponding lines in the expected and received results.
It does this by comparing each line of ground-truth text to each line of the
HTR results using longest common subsequence similarity, as implemented by
the LCSSEQ function in the Python "textdistance" package.  If the lines do
not pass a threshold score, Handprint looks at subsequent lines of the HTR
results and tries to reestablish correspondence to ground truth.  If nothing
else in the HTR results appears close enough to the expected ground-truth
line, the line is assumed to be missing from the HTR results and scored
appropriately.

Additional command-line arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The option -j (/j on Windows) tells Handprint to look for and reuse preexisting
results for each input instead of contacting the services.  This makes it look
for JSON files produced in a previous run with the -e option,

  somefile.handprint-amazon-rekognition.json
  somefile.handprint-amazon-textract.json
  somefile.handprint-google.json
  somefile.handprint-microsoft.json

and use those instead of getting results from the services.  This can be useful
to save repeated invocations of the services if all you want is to draw the
results differently or perform some testing/debugging on the same inputs.

To move the position of the text annotations overlayed over the input image,
you can use the option -m (or /m on Windows).  This takes two numbers separated
by a comma in the form x,y.  Positive numbers move the text rightward and
upward, respectively, relative to the default position.  The default position
of each text annotation in the annotated output is such that the left edge of
the word starts at the location of the upper left corner of the bounding box
returned by the service; this has the effect of putting the annotation near,
but above, the location of the (actual) word in the input image by default.
Using the text-move option allows you to move the annotation if desired.

To change the color of the text annotations overlayed over the input image,
you can use the option -x (or /x on Windows).  You can use hex color codes
such as "#ff0000" or X11/CSS4 color names with no spaces such as "purple"
or "darkgreen".  If you use a hex value, make sure to enclose the value with
quotes, or the shell will interpret the pound sign as a comment character.

To change the size of the text annotations overlayed over the input image,
you can use the option -z (or /z on Windows).  The value is in units of points.
The default size is 12 points.

Handprint will send files to the different services in parallel, using a
number of process threads at most equal to 1/2 of the number of cores on the
computer it is running on.  (E.g., if your computer has 4 cores, it will by
default use at most 2 threads.)  The option -t (/t on Windows) can be used to
change this number.

If given the -q option (/q on Windows), Handprint will not print its usual
informational messages while it is working.  It will only print messages
for warnings or errors.  By default messages printed by Handprint are also
color-coded.  If given the option -Z (/Z on Windows), Handprint will not color
the text of messages it prints.  (This latter option is useful when running
Handprint within subshells inside other environments such as Emacs.)

If given the -@ argument (/@ on Windows), this program will output a detailed
trace of what it is doing.  The debug trace will be sent to the given
destination, which can be '-' to indicate console output, or a file path to
send the output to a file.

When -@ (or /@ on Windows) has been given, Handprint installs a signal handler
on signal SIGUSR1 that will drop Handprint into the pdb debugger if the signal
is sent to the running process.  It's best to use -t 1 when attempting to use
a debugger because the subthreads will not stop running if the signal is sent.

If given the -V option (/V on Windows), this program will print the version
and other information, and exit without doing anything else.

Return values
~~~~~~~~~~~~~

This program exits with a return code of 0 if no problems are encountered.
It returns a nonzero value otherwise. The following table lists the possible
return values:

    0 = success -- program completed normally
    1 = the user interrupted the program's execution
    2 = encountered a bad or missing value for an option
    3 = no network detected -- cannot proceed
    4 = file error -- encountered a problem with a file
    5 = server error -- encountered a problem with a server
    6 = an exception or fatal error occurred

Command-line arguments summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''

    # Initial setup -----------------------------------------------------------

    pref = '/' if sys.platform.startswith('win') else '-'
    hint = f'(Hint: use {pref}h for help.)'
    ui = UI('Handprint',
            'HANDwritten Page RecognitIoN Test',
            use_color=not no_color,
            be_quiet=quiet,
            show_banner=not (version or list or add_creds != 'A'))
    ui.start()

    if debug != 'OUT':
        if __debug__: set_debug(True, debug, extra='%(threadName)s')
        import faulthandler
        faulthandler.enable()
        if not sys.platform.startswith('win'):
            # Even with a different signal, I can't get this to work on Win.
            pdb_on_signal(signal.SIGUSR1)

    # Preprocess arguments and handle early exits -----------------------------

    if version:
        print_version()
        exit(int(ExitCode.success))
    if list:
        inform('Known services: [bold]{}[/]', ', '.join(services_list()))
        exit(int(ExitCode.success))
    if add_creds != 'A':
        service = add_creds.lower()
        if service not in services_list():
            alert(f'Unknown service: "{service}". {hint}')
            exit(int(ExitCode.bad_arg))
        if not files or len(files) > 1:
            alert(f'Option {pref}a requires one file. {hint}')
            exit(int(ExitCode.bad_arg))
        creds_file = files[0]
        if not readable(creds_file):
            alert(f'File not readable: {creds_file}')
            exit(int(ExitCode.file_error))
        Credentials.save_credentials(service, creds_file)
        inform(f'Saved credentials for service "{service}".')
        exit(int(ExitCode.success))
    services = services_list() if services == 'S' else services.lower().split(
        ',')
    if services != 'S' and not all(s in services_list() for s in services):
        alert_fatal(f'"{services}" is/are not known services. {hint}')
        exit(int(ExitCode.bad_arg))
    display_given = display
    display = ['text'] if display == 'D' else display.lower().split(',')
    possible_displays = [
        'text', 'bb', 'bb-word', 'bb-words', 'bb-line', 'bb-lines', 'bb-para',
        'bb-paragraph', 'bb-paragraphs'
    ]
    if not all(d in possible_displays for d in display):
        alert_fatal(f'Unrecognized value for {pref}d: {display_given}. {hint}')
        exit(int(ExitCode.bad_arg))
    if no_grid and not extended and not compare:
        alert_fatal(
            f'{pref}G without {pref}e or {pref}c produces no output. {hint}')
        exit(int(ExitCode.bad_arg))
    if any(item.startswith('-') for item in files):
        bad = next(item for item in files if item.startswith('-'))
        alert_fatal(f'Unrecognized option "{bad}" in arguments. {hint}')
        exit(int(ExitCode.bad_arg))
    if not files and from_file == 'F':
        alert_fatal(f'Need images or URLs to have something to do. {hint}')
        exit(int(ExitCode.bad_arg))
    if relaxed and not compare:
        warn(f'Option {pref}r without {pref}c has no effect. {hint}')
    if text_move != 'M' and ',' not in text_move:
        alert_fatal(
            f'Option {pref}m requires an argument of the form x,y. {hint}')
        exit(int(ExitCode.bad_arg))
    if text_size != 'Z' and not isint(text_size):
        alert_fatal(
            f'Option {pref}z requires an integer as an argument. {hint}')
        exit(int(ExitCode.bad_arg))
    if confidence != 'N':
        if not isreal(confidence):
            alert_fatal(
                f'Option {pref}n requires a real number as an argument. {hint}'
            )
            exit(int(ExitCode.bad_arg))
        confidence = fast_real(confidence)
        if not (0 <= confidence <= 1.0):
            alert_fatal(
                f'Option {pref}n requires a real number between 0 and 1.0. {hint}'
            )
            exit(int(ExitCode.bad_arg))

    # Do the real work --------------------------------------------------------

    if __debug__: log('=' * 8 + f' started {timestamp()} ' + '=' * 8)
    body = exception = None
    try:
        body = MainBody(
            files=files,
            from_file=None if from_file == 'F' else from_file,
            output_dir=None if output_dir == 'O' else output_dir,
            add_creds=None if add_creds == 'A' else add_creds,
            base_name='document' if base_name == 'B' else base_name,
            confidence=0 if confidence == 'N' else confidence,
            text_color='red' if text_color == 'X' else text_color.lower(),
            text_shift='0,0' if text_move == 'M' else text_move,
            text_size='12' if text_size == 'Z' else int(text_size),
            display=display,
            make_grid=not no_grid,
            extended=extended,
            reuse_json=reuse_json,
            services=services,
            threads=max(1,
                        cpu_count() // 2 if threads == 'T' else int(threads)),
            compare='relaxed' if (compare and relaxed) else compare)
        config_interrupt(body.stop, UserCancelled(ExitCode.user_interrupt))
        body.run()
        exception = body.exception
    except Exception as ex:
        exception = sys.exc_info()

    # Try to deal with exceptions gracefully ----------------------------------

    exit_code = ExitCode.success
    if exception:
        if exception[0] == CannotProceed:
            exit_code = exception[1].args[0]
        elif exception[0] in [KeyboardInterrupt, UserCancelled]:
            if __debug__: log(f'received {exception.__class__.__name__}')
            warn('Interrupted.')
            exit_code = ExitCode.user_interrupt
        else:
            ex_class = exception[0]
            ex = exception[1]
            alert_fatal(f'An error occurred ({ex_class.__name__}): {str(ex)}')
            # Return a better error code for some common cases.
            if ex_class in [
                    FileNotFoundError, FileExistsError, PermissionError
            ]:
                exit_code = ExitCode.file_error
            else:
                exit_code = ExitCode.exception
            if __debug__:
                from traceback import format_exception
                details = ''.join(format_exception(*exception))
                logr(f'Exception: {str(ex)}\n{details}')
    else:
        inform('Done.')

    # And exit ----------------------------------------------------------------

    if __debug__: log('_' * 8 + f' stopped {timestamp()} ' + '_' * 8)
    if exit_code == ExitCode.user_interrupt:
        # This is a sledgehammer, but it kills everything, including ongoing
        # network get/post. I have not found a more reliable way to interrupt.
        os._exit(int(exit_code))
    else:
        exit(int(exit_code))
Ejemplo n.º 16
0
# Chronological split: rows stamped on or before the cutoff are used for
# training, everything later for validation.
date_before = date(2017, 4, 1)
train = data[data['timestamp'] <= date_before]
val = data[data['timestamp'] > date_before]

# Keep only the label and the message text, translate the channel through
# `mappings`, and order the rows by the translated channel. Each
# `reset_index()[[...]]` renumbers the rows and discards the old index column.
train_data = train[['channel', 'text']].reset_index()[['channel', 'text']]
train_data['channel'] = train_data.channel.map(mappings)
train_data = train_data.sort_values('channel').reset_index()[[
    'channel', 'text'
]]

val_data = val[['channel', 'text']].reset_index()[['channel', 'text']]
val_data['channel'] = val_data.channel.map(mappings)
val_data = val_data.sort_values('channel').reset_index()[['channel', 'text']]

# Drop rows whose text is a bare number or is shorter than 20 characters.
# The numeric checks come first, so `len` is only reached for values that
# neither `isfloat` nor `isint` accepted.
train_data = train_data[~train_data.text.
                        apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]
val_data = val_data[~val_data.text.
                    apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]

# Lower-cased text and int8 label arrays for each split.
train_text = train_data['text'].astype(str).apply(str.lower)
train_labels = np.asarray(train_data['channel'], dtype='int8')

val_text = val_data['text'].astype(str).apply(str.lower)
val_labels = np.asarray(val_data['channel'], dtype='int8')

# Blank out <...> tokens (with an optional trailing colon), :...: tokens,
# the HTML entity "&gt;" and word/dot runs around an "@", then collapse every
# whitespace run into a single space. The patterns are raw strings so that
# "\S", "\s", "\w" and "\." reach `re` untouched instead of being flagged as
# invalid string escapes.
train_text = train_text \
    .apply(lambda x: re.sub(r'(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x)) \
    .apply(lambda x: re.sub(r'\s+', ' ', x))

val_text = val_text \
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x)) \
def test_isint_returns_False_if_given_int_and_str_only_is_True(x):
    """``isint`` must report a falsy result for ``x`` when ``str_only=True``."""
    verdict = fastnumbers.isint(x, str_only=True)
    assert not verdict
def test_isint_returns_False_if_given_float(x):
    """``isint`` must report a falsy result for ``x``."""
    verdict = fastnumbers.isint(x)
    assert not verdict
def test_isint_returns_True_if_given_int(x):
    """``isint`` must report a truthy result for ``x`` when ``num_only=True``."""
    verdict = fastnumbers.isint(x, num_only=True)
    assert verdict
def test_isint():
    """Walk ``fastnumbers.isint`` through numeric, string and invalid inputs."""
    isint = fastnumbers.isint

    # 1-4. Floats are rejected: as a number, as a signed string, as a string
    # with an exponent, and as a string padded with whitespace.
    for float_like in (-367.3268, "+367.3268", "-367.3268e207",
                       "   -367.04   "):
        assert not isint(float_like)

    # 5. int number
    assert isint(499)
    assert not isint(499, str_only=True)
    assert isint(499, num_only=True)

    # 6. signed int string
    assert isint("-499")
    assert isint("-499", True)
    assert not isint("-499", num_only=True)

    # 7-9. int string with padded whitespace, long number, long string
    for int_like in ("   +3001   ", 35892482945872302493,
                     "35892482945872302493"):
        assert isint(int_like)

    # 10. return type: the results must be the bool singletons themselves
    outcomes = [
        (isint(4029), True),
        (isint(4029, str_only=True), False),
        (isint("4029"), True),
        (isint("4029", True), True),
        (isint("4029.50"), False),
        (isint(4029.50), False),
    ]
    for actual, expected in outcomes:
        assert actual is expected

    # 11-16. Everything else is rejected: a list, a non-numeric string, a
    # string mixing numbers with text, infinity, NaN, and a lone sign/'e'/'.'.
    for invalid in (["hey"], "not_a_number", "26.8 lb", "inf", "nan",
                    "+", "-", "e", "."):
        assert not isint(invalid)

    # 18. Unicode numbers
    for accepted_glyph in (u"⑦", u"⁸"):
        assert isint(accepted_glyph)
    for rejected_glyph in (u"⅔", u"Ⅴ"):
        assert not isint(rejected_glyph)
Ejemplo n.º 21
0
    def parse(value,
              infer: bool = False,
              dtypes=None,
              str_funcs=None,
              int_funcs=None):
        """
        Classify one ``(col_name, value)`` pair into a data type label.

        :param value: Two-item ``(col_name, value)`` pair; it is unpacked on entry.
        :param infer: When ``True`` and the column is declared as ``"string"`` in
        ``dtypes``, the value itself is inspected to pick the label.
        :param dtypes: Mapping from column name to its declared data type. It is
        indexed unconditionally, so the ``None`` default cannot actually be used.
        :param str_funcs: Sequence of ``(predicate, label)`` entries tried in order
        on string values; defaults to the built-in string checks.
        :param int_funcs: Sequence of ``(predicate, label)`` entries tried in order
        on integer values; defaults to the built-in integer checks.
        :return: ``((col_name, data_type), 1)``
        """
        col_name, cell = value

        # Default predicate tables, meant to be ordered from the cheapest check
        # to the most expensive one.
        if int_funcs is None:
            int_funcs = [(str_to_credit_card, "credit_card_number"),
                         (str_to_zip_code, "zip_code")]

        if str_funcs is None:
            str_funcs = [(str_to_missing, "missing"),
                         (str_to_boolean, "boolean"), (str_to_date, "date"),
                         (str_to_array, "array"), (str_to_object, "object"),
                         (str_to_ip, "ip"), (str_to_url, "url"),
                         (str_to_email, "email"), (str_to_gender, "gender"),
                         (str_to_null, "null")]

        def first_match(candidates, fallback):
            # Label of the first entry whose predicate returns exactly True,
            # or the fallback when none does.
            return next((entry[1] for entry in candidates
                         if entry[0](cell) is True), fallback)

        declared = dtypes[col_name]

        if declared == "string" and infer is True:
            # Inference path: bool is tested before the numeric checks.
            if isinstance(cell, bool):
                label = "boolean"
            elif fastnumbers.isint(cell):
                label = first_match(int_funcs, "int")
            elif fastnumbers.isfloat(cell):
                label = "decimal"
            elif isinstance(cell, str):
                label = first_match(str_funcs, "string")
            else:
                label = "null"
        elif is_null(cell) is True:
            label = "null"
        elif str_to_missing(cell) is True:
            label = "missing"
        elif declared.startswith("array"):
            # Any declared type starting with "array" is reported as "array".
            label = "array"
        else:
            label = declared

        return (col_name, label), 1
def test_isint():
    """Check ``fastnumbers.isint`` against floats, ints, strings and junk."""
    check = fastnumbers.isint

    # 1-4. float number, signed float string, float string with exponents,
    # float string with padded whitespace: none of them is an int.
    float_inputs = [-367.3268, "+367.3268", "-367.3268e207", "   -367.04   "]
    assert not any(check(item) for item in float_inputs)

    # 5. int number
    assert check(499)
    assert not check(499, str_only=True)
    assert check(499, num_only=True)

    # 6. signed int string
    assert check('-499')
    assert check('-499', True)
    assert not check('-499', num_only=True)

    # 7-9. int string with padded whitespace, long number, long string
    int_inputs = ['   +3001   ', 35892482945872302493, "35892482945872302493"]
    assert all(check(item) for item in int_inputs)

    # 10. return type
    must_be_true = [check(4029), check("4029"), check("4029", True)]
    must_be_false = [
        check(4029, str_only=True),
        check("4029.50"),
        check(4029.50),
    ]
    assert all(result is True for result in must_be_true)
    assert all(result is False for result in must_be_false)

    # 11-16. A list, an invalid string, an invalid string with numbers,
    # infinity, NaN, and a lone sign/'e'/'.' are all rejected.
    junk_inputs = [['hey'], 'not_a_number', '26.8 lb', 'inf', 'nan',
                   '+', '-', 'e', '.']
    assert not any(check(item) for item in junk_inputs)

    # 18. Unicode numbers
    assert all(check(glyph) for glyph in (u'⑦', u'⁸'))
    assert not any(check(glyph) for glyph in (u'⅔', u'Ⅴ'))
Ejemplo n.º 23
0
def str_to_int(_value):
    """Return ``True`` when ``fastnumbers.isint`` accepts ``_value``, else ``False``."""
    return bool(fastnumbers.isint(_value))
def test_isint_returns_False_for_nan_or_inf_string():
    """The strings ``'nan'`` and ``'inf'`` must not be reported as ints."""
    for special in ('nan', 'inf'):
        assert not fastnumbers.isint(special)
def test_isint_with_no_arguments_fails():
    """Calling ``isint`` with an unknown keyword must raise ``TypeError``."""
    unknown_keyword = {'invalid': 'dummy'}
    with raises(TypeError):
        fastnumbers.isint(5, **unknown_keyword)
def test_isint_returns_True_if_given_int_string_padded_or_not(x, y, z):
    """Check that ``isint`` accepts string renderings of the integer ``x``.

    ``y`` and ``z`` are the numbers of spaces placed before and after
    ``repr(x)`` to build the padded input.

    NOTE(review): the three arguments are presumably supplied by a test-data
    generator whose decorator is not visible here -- confirm.
    """
    # From here on ``y`` is the padded string, no longer the space count.
    y = ''.join(repeat(' ', y)) + repr(x) + ''.join(repeat(' ', z))
    # The unpadded form must yield the ``True`` singleton itself.
    assert fastnumbers.isint(repr(x)) is True
    assert fastnumbers.isint(repr(x), str_only=True)
    assert fastnumbers.isint(y)
    # Every base from 2 through 36, using ``baseN`` to render ``x``.
    for base in range(2, 36 + 1):
        if len(repr(x)) < 30:  # Avoid recursion error because of overly simple baseN function.
            assert fastnumbers.isint(baseN(x, base), base=base)
    # Prefixed literals, checked with the explicit base and with base=0.
    # NOTE(review): base=0 presumably means "infer the base from the prefix",
    # as with the built-in int() -- confirm against the fastnumbers docs.
    assert fastnumbers.isint(bin(x), base=2)
    assert fastnumbers.isint(bin(x), base=0)
    assert fastnumbers.isint(oct(x), base=8)
    assert fastnumbers.isint(oct(x), base=0)
    # On Python 2 also try the octal string with '0o' rewritten to a bare '0'.
    if python_version_tuple()[0] == '2':
        assert fastnumbers.isint(oct(x).replace('0o', '0'), base=8)
        assert fastnumbers.isint(oct(x).replace('0o', '0'), base=0)
    assert fastnumbers.isint(hex(x), base=16)
    assert fastnumbers.isint(hex(x), base=0)