Example #1
def get_item_sink(output_file):
    fh = get_file_handle(output_file, 'w')

    if output_file.endswith('.csv'):
        set_max_field_size_limit()

        # The CSV writer is created lazily: the keys of the first item
        # determine the field names and the header row.
        writer = None

        def sink(item):
            nonlocal writer
            if writer is None:
                fields = list(six.iterkeys(item))
                writer = csv.DictWriter(fh,
                                        fieldnames=fields,
                                        extrasaction='ignore')
                writer.writeheader()
            writer.writerow(item)
    else:
        # Fall back to newline-delimited JSON for non-CSV outputs.
        def sink(item):
            fh.write(json.dumps(item) + '\n')

    try:
        yield sink
    finally:
        fh.close()
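A minimal usage sketch for the sink above, assuming get_file_handle, set_max_field_size_limit, csv, json and six are available as in the surrounding module, and that the generator is intended to be used as a context manager; contextlib.contextmanager is applied explicitly here because no decorator is shown, and the file name and item keys are made up:

import contextlib

item_sink = contextlib.contextmanager(get_item_sink)

# The first item's keys become the CSV header.
with item_sink('items.csv') as sink:
    sink({'hash': '0xabc', 'gas': 21000})
    sink({'hash': '0xdef', 'gas': 50000})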
Example #2
def extract_csv_column(input, output, column):
    """Extracts column from given CSV file. Deprecated - use extract_field."""
    set_max_field_size_limit()

    with smart_open(input, 'r') as input_file, smart_open(output, 'w') as output_file:
        reader = csv.DictReader(input_file)
        for row in reader:
            output_file.write(row[column] + '\n')
Example #3
def extract_csv_column_unique(input, output, column):
    """Extracts unique values of the given column from a CSV file."""
    set_max_field_size_limit()

    with smart_open(input, 'r') as input_file, smart_open(output,
                                                          'w') as output_file:
        reader = csv.DictReader(input_file)
        seen = set()  # set for fast O(1) amortized lookup
        for row in reader:
            if row[column] in seen:
                continue
            seen.add(row[column])
            output_file.write(row[column] + '\n')
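For illustration, the function above could be called like this (file and column names are hypothetical); because of the seen set, each value is written only once, in first-seen order:

extract_csv_column_unique('transactions.csv', 'unique_senders.txt', 'from_address')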
Example #4
def get_item_iterable(input_file):
    fh = get_file_handle(input_file, 'r')

    if input_file.endswith('.csv'):
        set_max_field_size_limit()
        reader = csv.DictReader(fh)
    else:
        # Treat anything other than CSV as newline-delimited JSON.
        reader = (json.loads(line) for line in fh)

    try:
        yield reader
    finally:
        fh.close()
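The iterable pairs naturally with the sink from Example #1; a hypothetical streaming conversion from newline-delimited JSON to CSV, again applying contextlib.contextmanager explicitly and using made-up file names, might look like:

import contextlib

item_iterable = contextlib.contextmanager(get_item_iterable)
item_sink = contextlib.contextmanager(get_item_sink)

# Stream each parsed item straight into the sink; both file handles are closed on exit.
with item_iterable('blocks.json') as items, item_sink('blocks.csv') as sink:
    for item in items:
        sink(item)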
Example #5
import argparse
import csv

from ethereumetl.csv_utils import set_max_field_size_limit
from ethereumetl.file_utils import smart_open

parser = argparse.ArgumentParser(description='Extracts a single column from a given csv file.')
parser.add_argument('-i', '--input', default='-', type=str, help='The input file. If not specified stdin is used.')
parser.add_argument('-o', '--output', default='-', type=str, help='The output file. If not specified stdout is used.')
parser.add_argument('-c', '--column', required=True, type=str, help='The csv column name to extract.')

args = parser.parse_args()

set_max_field_size_limit()

with smart_open(args.input, 'r') as input_file, smart_open(args.output, 'w') as output_file:
    reader = csv.DictReader(input_file)
    for row in reader:
        output_file.write(row[args.column] + '\n')
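Assuming the script above is saved as extract_csv_column.py and the input CSV has a column named hash (both names are illustrative), a typical invocation would be:

python extract_csv_column.py --input transactions.csv --output hashes.txt --column hash

Per the argparse defaults and help text, omitting --input or --output makes the script read from stdin or write to stdout ('-').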